Merge branch 'open-compass:main' into main

commit d66be56919
Author: bittersweet1999 (committed by GitHub)
Date:   2024-09-02 13:57:05 +08:00
1834 changed files with 77680 additions and 1683 deletions

@@ -2,48 +2,57 @@ from mmengine.config import read_base
with read_base():
# choose a list of datasets
from ...configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from ...configs.datasets.race.race_ppl import \
from opencompass.configs.datasets.race.race_ppl import \
race_datasets # noqa: F401, E501
from ...configs.models.deepseek.hf_deepseek_moe_16b_base import \
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
# read hf models - chat models
from ...configs.models.deepseek.lmdeploy_deepseek_7b_base import \
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from ...configs.models.deepseek.vllm_deepseek_moe_16b_base import \
from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
models as vllm_deepseek_moe_16b_base_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_2b import \
from opencompass.configs.models.gemma.hf_gemma_2b import \
models as hf_gemma_2b_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_7b import \
from opencompass.configs.models.gemma.hf_gemma_7b import \
models as hf_gemma_7b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
models as hf_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_7b import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_7b import \
models as lmdeploy_internlm2_7b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from ...configs.models.hf_llama.lmdeploy_llama3_8b import \
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
models as lmdeploy_llama3_8b_model # noqa: F401, E501
from ...configs.models.mistral.hf_mistral_7b_v0_2 import \
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
from ...configs.models.mistral.vllm_mistral_7b_v0_2 import \
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
from ...configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501
from ...configs.models.qwen.hf_qwen2_0_5b import \
from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
models as hf_qwen2_0_5b_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_1_5b import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \
models as lmdeploy_qwen2_1_5b_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_7b import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \
models as lmdeploy_qwen2_7b_model # noqa: F401, E501
from ...configs.models.qwen.vllm_qwen1_5_0_5b import \
from opencompass.configs.models.qwen.vllm_qwen1_5_0_5b import \
models as vllm_qwen1_5_0_5b_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_6b import \
from opencompass.configs.models.yi.hf_yi_1_5_6b import \
models as hf_yi_1_5_6b_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_9b import \
from opencompass.configs.models.yi.hf_yi_1_5_9b import \
models as hf_yi_1_5_9b_model # noqa: F401, E501
from ...configs.summarizers.medium import summarizer # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
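An editorial note on the aggregation idiom that closes this config: every config imported under `read_base()` binds a `models` (or `*_datasets`) list into the module namespace under its alias, and the two `sum(..., [])` lines simply concatenate all of them. A self-contained sketch of the same idiom, with made-up names:

```python
# Sketch of the locals()-scanning idiom above (names are illustrative only).
model_a_model = [{'abbr': 'model-a'}]
model_b_model = [{'abbr': 'model-b'}]
gsm8k_datasets = [{'abbr': 'gsm8k'}]

# locals() here is evaluated in the enclosing module scope, so the
# comprehension sees every '*_model' binding and flattens the lists.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
print([m['abbr'] for m in models])  # ['model-a', 'model-b']
```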

@@ -1,70 +1,105 @@
from mmengine.config import read_base
from opencompass.models import OpenAISDK
with read_base():
# choose a list of datasets
from ...configs.datasets.gsm8k.gsm8k_gen import \
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
gsm8k_datasets # noqa: F401, E501
from ...configs.datasets.race.race_gen import \
from opencompass.configs.datasets.race.race_gen import \
race_datasets # noqa: F401, E501
# read hf models - chat models
from ...configs.models.baichuan.hf_baichuan2_7b_chat import \
from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \
models as hf_baichuan2_7b_chat_model # noqa: F401, E501
from ...configs.models.chatglm.hf_glm4_9b_chat import \
from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
models as hf_glm4_9b_chat_model # noqa: F401, E501
from ...configs.models.deepseek.hf_deepseek_7b_chat import \
from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
models as hf_deepseek_7b_chat_model # noqa: F401, E501
from ...configs.models.deepseek.hf_deepseek_moe_16b_chat import \
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501
from ...configs.models.deepseek.vllm_deepseek_7b_chat import \
from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
models as vllm_deepseek_7b_chat_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_2b_it import \
from opencompass.configs.models.gemma.hf_gemma_2b_it import \
models as hf_gemma_2b_it_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_7b_it import \
from opencompass.configs.models.gemma.hf_gemma_7b_it import \
models as hf_gemma_7b_it_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
models as lmdeploy_internlm2_chat_1_8b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
models as lmdeploy_internlm2_chat_1_8b_sft_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
models as lmdeploy_internlm2_chat_7b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501
from ...configs.models.hf_internlm.vllm_internlm2_chat_7b import \
from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from ...configs.models.hf_llama.hf_llama3_8b_instruct import \
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as hf_llama3_8b_instruct_model # noqa: F401, E501
from ...configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501
from ...configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from ...configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from ...configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501
from ...configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
models as hf_minicpm_2b_sft_bf16_model # noqa: F401, E501
from ...configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
models as hf_minicpm_2b_sft_fp32_model # noqa: F401, E501
from ...configs.models.phi.hf_phi_3_mini_4k_instruct import \
from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from ...configs.models.phi.hf_phi_3_small_8k_instruct import \
from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
from ...configs.models.qwen.hf_qwen1_5_0_5b_chat import \
from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
models as lmdeploy_qwen2_1_5b_instruct_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
models as lmdeploy_qwen2_7b_instruct_model # noqa: F401, E501
from ...configs.models.qwen.vllm_qwen1_5_0_5b_chat import \
from opencompass.configs.models.qwen.vllm_qwen1_5_0_5b_chat import \
models as vllm_qwen1_5_0_5b_chat_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_6b_chat import \
from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import \
models as hf_yi_1_5_6b_chat_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_9b_chat import \
from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
models as hf_yi_1_5_9b_chat_model # noqa: F401, E501
from ...configs.summarizers.medium import summarizer # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
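For orientation, an editorial sketch (not part of the diff): a `meta_template` like the one above tells OpenCompass how to map its internal dialogue roles onto the role names an API model wrapper expects when it builds its message list. A minimal, hypothetical illustration:

```python
# Editorial sketch only -- roughly what a meta_template accomplishes; this is
# not OpenCompass's actual implementation.
def apply_meta_template(turns, meta_template):
    """Map internal turns like {'role': 'HUMAN', 'prompt': ...} to API roles."""
    role_map = {r['role']: r['api_role'] for r in meta_template['round']}
    for r in meta_template.get('reserved_roles', []):
        role_map.setdefault(r['role'], r['api_role'])
    return [{'role': role_map[t['role']], 'content': t['prompt']} for t in turns]

messages = apply_meta_template(
    [{'role': 'HUMAN', 'prompt': 'What is 1 + 1?'}],
    dict(round=[dict(role='HUMAN', api_role='HUMAN'),
                dict(role='BOT', api_role='BOT', generate=True)],
         reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')]))
print(messages)  # [{'role': 'HUMAN', 'content': 'What is 1 + 1?'}]
```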
model_name = ''
models.append(
dict(
abbr='lmdeploy-api-test',
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://judgemodel:10001/v1',
path='compass_judger_internlm2_102b_0508',
tokenizer_path='internlm/internlm2_5-20b-chat',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=50,
max_out_len=1024,
max_seq_len=4096,
temperature=0.01,
batch_size=128,
retry=3,
))
for d in datasets:
d['reader_cfg']['test_range'] = '[0:100]'
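An editorial note on the loop above: it caps every dataset at its first 100 test samples so the daily regression run stays fast. The `test_range` value reads like a Python slice applied to the test split; roughly (illustrative only, not the library's actual loader code):

```python
# Illustrative only: the effect of reader_cfg['test_range'] = '[0:100]'.
test_range = '[0:100]'
start, stop = (int(x) if x else None
               for x in test_range.strip('[]').split(':'))
full_split = list(range(1319))       # e.g. GSM8K has 1319 test questions
capped = full_split[start:stop]
assert len(capped) == 100
```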

@@ -8,22 +8,25 @@ output_path = 'regression_result_daily'
chat_model_list = [
'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2-chat-1.8b-turbomind',
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2_5-7b-chat-hf',
'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind',
'internlm2-chat-7b-sft-turbomind', 'llama-3-8b-instruct-hf',
'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf',
'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-turbomind',
'qwen2-7b-instruct-turbomind', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf'
'internlm2-chat-7b-sft-turbomind', 'internlm2_5-7b-chat-turbomind',
'llama-3-8b-instruct-hf', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.2-hf', 'minicpm-2b-dpo-fp32-hf',
'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf',
'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf',
'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', 'lmdeploy-api-test'
]
base_model_list = [
'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', 'gemma-2b-hf',
'gemma-7b-hf', 'internlm2-1.8b-turbomind', 'internlm2-7b-turbomind',
'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind',
'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind', 'yi-1.5-6b-hf',
'yi-1.5-9b-hf'
'internlm2_5-7b-turbomind', 'internlm2_5-7b-hf',
'internlm2-base-7b-turbomind', 'internlm2-base-7b-hf',
'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf',
'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind',
'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
]
dataset_list = ['gsm8k', 'race-middle', 'race-high']
@@ -77,6 +80,50 @@ class TestBase:
assert_score(result_score, base_score)
@pytest.mark.usefixtures('result_scores')
class TestCmdCase:
@pytest.mark.case1
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-hf', 'race-middle'),
('internlm2_5-7b-hf', 'race-high')])
def test_cmd_case1(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
@pytest.mark.case2
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-chat-turbomind', 'race-middle'),
('internlm2_5-7b-chat-turbomind', 'race-high')])
def test_cmd_case2(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
@pytest.mark.case3
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b_hf', 'race-middle'),
('internlm2_5-7b_hf', 'race-high')])
def test_cmd_case3(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
@pytest.mark.case4
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-chat_hf', 'race-middle'),
('internlm2_5-7b-chat_hf', 'race-high')])
def test_cmd_case4(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
def assert_score(score, baseline):
if score is None or score == '-':
assert False, 'value is none'
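The hunk above cuts `assert_score` off after its None-guard. Judging from the call sites, the remainder presumably compares the score against the stored baseline within some tolerance; a purely hypothetical completion for orientation (this is a guess, not the repository's actual code):

```python
# Hypothetical continuation of the truncated helper above -- a guess at its
# shape from how it is called, NOT the actual implementation.
def assert_score(score, baseline):
    if score is None or score == '-':
        assert False, 'value is none'
    # e.g. accept scores within a small band around the stored baseline
    assert baseline - 5 <= float(score) <= baseline + 5, \
        f'score {score} deviates from baseline {baseline}'
```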

@@ -3,6 +3,11 @@ baichuan2-7b-chat-hf:
race-middle: 74
race-high: 79
glm-4-9b-chat-hf:
gsm8k: 75
race-middle: 88
race-high: 88
deepseek-7b-chat-hf:
gsm8k: 60
race-middle: 74
@@ -23,6 +28,16 @@ gemma-7b-it-hf:
race-middle: 74
race-high: 71
internlm2_5-7b-chat-hf:
gsm8k: 86
race-middle: 92
race-high: 93
internlm2_5-7b-chat-turbomind:
gsm8k: 87
race-middle: 92
race-high: 93
internlm2-chat-1.8b-turbomind:
gsm8k: 40
race-middle: 82
@@ -108,6 +123,10 @@ deepseek-moe-16b-base-hf:
race-middle: 35
race-high: 23
lmdeploy-api-test:
gsm8k: 90
race-middle: 95
race-high: 96
deepseek-7b-base-turbomind:
gsm8k: 21
@@ -124,8 +143,18 @@ gemma-7b-hf:
race-middle: 59
race-high: 66
internlm2_5-7b-hf:
gsm8k: 46
race-middle: 92
race-high: 91
internlm2_5-7b-turbomind:
gsm8k: 73
race-middle: 90
race-high: 91
internlm2-1.8b-turbomind:
gsm8k: 27
gsm8k: 25
race-middle: 75
race-high: 72
@@ -134,6 +163,11 @@ internlm2-7b-turbomind:
race-middle: 78
race-high: 76
internlm2-base-7b-hf:
gsm8k: 2
race-middle: 71
race-high: 74
internlm2-base-7b-turbomind:
gsm8k: 39
race-middle: 75

@@ -14,6 +14,7 @@ env:
PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets
HF_DATASETS_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1
HF_HUB_OFFLINE: 1
@@ -28,54 +29,69 @@ jobs:
uses: actions/checkout@v2
- name: Prepare - create conda env and install torch
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}} python=3.10
conda activate ${{env.CONDA_ENV}}
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.0.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl --index-url https://download.pytorch.org/whl/cu118
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.2+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install bitsandbytes
pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
pip install xformers==0.0.25.post1 --cache-dir ${{env.PIP_CACHE_PATH}} --extra-index-url https://download.pytorch.org/whl/cu118
pip install xformers==0.0.25.post1 --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
- name: Prepare - Pip install code
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
pip install human_eval transformers protobuf pytest --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
- name: Prepare - prepare data and hf model
run: |
cp -r ${{env.USERSPACE_PREFIX}}/data .
ln -s ${{env.DATEASET_CACHE_PATH}} data
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
- name: Run chat model test
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
rm -rf regression_result_daily
export from_tf=TRUE
rm -rf /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary
python3 run.py .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }} --reuse
cp -r /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary regression_result_daily
sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py
python3 run.py .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat/*/summary regression_result_daily
python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
rm -rf regression_result_daily
export from_tf=TRUE
rm -rf /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary
python3 run.py .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }} --reuse
cp -r /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary regression_result_daily
python3 run.py .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base/*/summary regression_result_daily
python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run command testcase
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu
python run.py --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1 --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
python run.py --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
python run.py --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
python run.py --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Remove Conda Env
if: always()
run: |
cp -r regression_result_daily/* /cpfs01/user/qa-llm-cicd/report
eval "$(conda shell.bash hook)"
rm -rf regression_result_daily
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda env remove -y --name ${{env.CONDA_ENV}}
conda info --envs

@@ -35,7 +35,7 @@ jobs:
uses: actions/checkout@v2
- name: Prepare - Install opencompass
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y
python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip
@@ -47,7 +47,7 @@ jobs:
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
- name: Run test
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
rm -rf regression_result
@@ -65,7 +65,7 @@ jobs:
- name: Uninstall opencompass
if: always()
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y
conda info --envs

@@ -10,25 +10,43 @@ exclude: |
opencompass/datasets/teval/|
opencompass/datasets/NPHardEval/|
opencompass/datasets/TheoremQA|
docs/zh_cn/advanced_guides/compassbench_intro.md|
docs/zh_cn/advanced_guides/compassbench_v2_0.md
docs/zh_cn/advanced_guides/compassbench_intro.md |
docs/zh_cn/advanced_guides/compassbench_v2_0.md |
opencompass/configs/datasets/ |
opencompass/configs/models/|
opencompass/configs/summarizers/|
opencompass/configs/dataset_collections/ |
opencompass/utils/datasets.py |
opencompass/utils/datasets_info.py
)
repos:
- repo: https://gitee.com/openmmlab/mirrors-flake8
rev: 5.0.4
hooks:
- id: flake8
exclude: configs/
exclude: |
(?x)^(
configs/ |
example_scripts/
)
- repo: https://gitee.com/openmmlab/mirrors-isort
rev: 5.11.5
hooks:
- id: isort
exclude: configs/
exclude: |
(?x)^(
configs/ |
example_scripts/
)
- repo: https://gitee.com/openmmlab/mirrors-yapf
rev: v0.32.0
hooks:
- id: yapf
exclude: configs/
exclude: |
(?x)^(
configs/ |
example_scripts/
)
- repo: https://gitee.com/openmmlab/mirrors-codespell
rev: v2.2.1
hooks:
@@ -36,8 +54,10 @@ repos:
exclude: |
(?x)^(
.*\.jsonl|
opencompass/datasets/subjective/mtbench101.py|
configs/
.*\.md.template|
configs/ |
opencompass/configs/ |
example_scripts/
)
- repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
rev: v4.3.0
@@ -88,6 +108,53 @@ repos:
pass_filenames: true
require_serial: true
files: ^configs/datasets
- repo: local
hooks:
- id: update-dataset-suffix-pacakge
name: dataset suffix updater(package)
entry: ./tools/update_dataset_suffix.py
language: script
pass_filenames: false
# require_serial: true
# files: ^opencompass/configs/datasets
args:
- --root_folder
- opencompass/configs/datasets
- repo: local
hooks:
- id: compare-configs-datasets
name: compare configs datasets
entry: ./tools/compare_configs.py
language: script
pass_filenames: false
# require_serial: true
args:
- configs/datasets
- opencompass/configs/datasets
- repo: local
hooks:
- id: compare-configs-models
name: compare configs models
entry: ./tools/compare_configs.py
language: script
pass_filenames: false
# require_serial: true
args:
- configs/models
- opencompass/configs/models
- --ignore
- llama
- repo: local
hooks:
- id: compare-configs-summarizers
name: compare configs summarizers
entry: ./tools/compare_configs.py
language: script
pass_filenames: false
# require_serial: true
args:
- configs/summarizers
- opencompass/configs/summarizers
# - repo: https://github.com/open-mmlab/pre-commit-hooks
# rev: v0.2.0 # Use the ref you want to point at
# hooks:
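An editorial sketch of what a comparison hook of this kind plausibly checks: that every config under `configs/` has an identical counterpart under the packaged `opencompass/configs/` tree. Hypothetical logic only; the real `tools/compare_configs.py` may differ:

```python
# Hypothetical sketch of a compare-configs pre-commit check; the actual
# tools/compare_configs.py may work differently.
import filecmp
import sys
from pathlib import Path

def compare_trees(src_dir, dst_dir):
    out_of_sync = []
    for src in Path(src_dir).rglob('*.py'):
        dst = Path(dst_dir) / src.relative_to(src_dir)
        if not dst.exists() or not filecmp.cmp(src, dst, shallow=False):
            out_of_sync.append(str(src))
    for path in out_of_sync:
        print(f'out of sync: {path}')
    return 1 if out_of_sync else 0

if __name__ == '__main__':
    sys.exit(compare_trees(sys.argv[1], sys.argv[2]))
```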

@@ -13,24 +13,42 @@ exclude: |
opencompass/datasets/TheoremQA|
opencompass/datasets/subjective/mtbench101.py|
docs/zh_cn/advanced_guides/compassbench_intro.md |
docs/zh_cn/advanced_guides/compassbench_v2_0.md
docs/zh_cn/advanced_guides/compassbench_v2_0.md |
opencompass/configs/datasets/ |
opencompass/configs/models/|
opencompass/configs/summarizers/ |
opencompass/configs/dataset_collections/ |
opencompass/utils/datasets.py |
opencompass/utils/datasets_info.py
)
repos:
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
exclude: configs/
exclude: |
(?x)^(
configs/ |
example_scripts/
)
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
- id: isort
exclude: configs/
exclude: |
(?x)^(
configs/ |
example_scripts/
)
- repo: https://github.com/pre-commit/mirrors-yapf
rev: v0.32.0
hooks:
- id: yapf
exclude: configs/
exclude: |
(?x)^(
configs/ |
example_scripts/
)
- repo: https://github.com/codespell-project/codespell
rev: v2.2.1
hooks:
@@ -39,7 +57,9 @@ repos:
(?x)^(
.*\.jsonl|
.*\.md.template|
configs/
configs/ |
opencompass/configs/ |
example_scripts/
)
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
@@ -90,6 +110,54 @@ repos:
pass_filenames: true
require_serial: true
files: ^configs/datasets
- repo: local
hooks:
- id: update-dataset-suffix-pacakge
name: dataset suffix updater(package)
entry: ./tools/update_dataset_suffix.py
language: script
pass_filenames: false
# require_serial: true
# files: ^opencompass/configs/datasets
args:
- --root_folder
- opencompass/configs/datasets
- repo: local
hooks:
- id: compare-configs-datasets
name: compare configs datasets
entry: ./tools/compare_configs.py
language: script
pass_filenames: false
# require_serial: true
args:
- configs/datasets
- opencompass/configs/datasets
- repo: local
hooks:
- id: compare-configs-models
name: compare configs models
entry: ./tools/compare_configs.py
language: script
pass_filenames: false
# require_serial: true
args:
- configs/models
- opencompass/configs/models
- --ignore
- llama
- repo: local
hooks:
- id: compare-configs-summarizers
name: compare configs summarizers
entry: ./tools/compare_configs.py
language: script
pass_filenames: false
# require_serial: true
args:
- configs/summarizers
- opencompass/configs/summarizers
# - repo: https://github.com/open-mmlab/pre-commit-hooks
# rev: v0.2.0 # Use the ref you want to point at
# hooks:

MANIFEST.in (new file)

@@ -0,0 +1,2 @@
recursive-include opencompass/configs *.py *.yml *.json *.txt *.md
recursive-include opencompass/openicl/icl_evaluator/hf_metrics *.py
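These two lines make setuptools include the config tree and the HF metric files in built distributions, which is what allows the imports elsewhere in this commit to move from relative `...configs` paths to the packaged `opencompass.configs` namespace. A quick post-install sanity check one could run (editorial suggestion, not from the commit):

```python
# Editorial sanity check: confirm the configs actually ship with an installed
# opencompass distribution (assumes `pip install opencompass` succeeded).
from importlib import resources

cfg_root = resources.files('opencompass') / 'configs'
print(cfg_root)  # should resolve inside the installed package
print(sorted(p.name for p in (cfg_root / 'datasets').iterdir())[:5])
```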

README.md

@@ -70,8 +70,11 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2024.08.20\]** OpenCompass now supports the [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists. 🔥🔥🔥
- **\[2024.08.16\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [RULER](https://arxiv.org/pdf/2404.06654). RULER provides an evaluation of long-context including retrieval, multi-hop tracing, aggregation, and question answering through flexible configurations. Check out the [RULER](configs/datasets/ruler/README.md) evaluation config now! 🔥🔥🔥
- **\[2024.08.09\]** We have released the example data and configuration for the CompassBench-202408, welcome to [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more details. 🔥🔥🔥
- **\[2024.08.01\]** We supported the [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) models. Welcome to try! 🔥🔥🔥
- **\[2024.07.23\]** We now support [ModelScope](www.modelscope.cn) datasets; you can load them on demand without downloading all the data to your local disk. Welcome to try! 🔥🔥🔥
- **\[2024.07.17\]** We have released the example data and configuration for the CompassBench-202408, welcome to [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more details. 🔥🔥🔥
- **\[2024.07.17\]** We are excited to announce the release of NeedleBench's [technical report](http://arxiv.org/abs/2407.11963). We invite you to visit our [support documentation](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html) for detailed evaluation guidelines. 🔥🔥🔥
- **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which has **outstanding reasoning capability**, a **1M context window**, and **stronger tool use**; you can try the models in [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM). 🔥🔥🔥
- **\[2024.06.20\]** OpenCompass now supports one-click switching between inference acceleration backends, enhancing the efficiency of the evaluation process. In addition to the default HuggingFace inference backend, it now also supports popular backends [LMDeploy](https://github.com/InternLM/lmdeploy) and [vLLM](https://github.com/vllm-project/vllm). This feature is available via a simple command-line switch and through deployment APIs. For detailed usage, see the [documentation](docs/en/advanced_guides/accelerator_intro.md).🔥🔥🔥.
@@ -114,29 +117,51 @@ Below are the steps for quick installation and datasets preparation.
### 💻 Environment Setup
#### Open-source Models with GPU
We highly recommend using conda to manage your python environment.
```bash
conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
conda activate opencompass
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
```
- #### Create your virtual environment
#### API Models with CPU-only
```bash
conda create --name opencompass python=3.10 -y
conda activate opencompass
```
```bash
conda create -n opencompass python=3.10 pytorch torchvision torchaudio cpuonly -c pytorch -y
conda activate opencompass
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
# If needed, also install the requirements for API models via `pip install -r requirements/api.txt`.
```
- #### Install OpenCompass via pip
```bash
pip install -U opencompass
## Full installation (with support for more datasets)
# pip install "opencompass[full]"
## Environment with model acceleration frameworks
## Manage different acceleration frameworks using virtual environments
## since they usually have dependency conflicts with each other.
# pip install "opencompass[lmdeploy]"
# pip install "opencompass[vllm]"
## API evaluation (e.g. OpenAI, Qwen)
# pip install "opencompass[api]"
```
- #### Install OpenCompass from source
If you want to use opencompass's latest features, or develop new features, you can also build it from source
```bash
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
# pip install -e ".[full]"
# pip install -e ".[vllm]"
```
### 📂 Data Preparation
You can choose one of the following methods to prepare datasets.
#### Offline Preparation
You can download and extract the datasets with the following commands:
```bash
@@ -145,12 +170,19 @@ wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/Ope
unzip OpenCompassData-core-20240207.zip
```
Also, use the [ModelScope](www.modelscope.cn) to load the datasets on demand.
#### Automatic Download from OpenCompass
We now support automatic download of datasets from the OpenCompass storage server. You can run the evaluation with the extra `--dry-run` argument to download these datasets.
Currently supported datasets are listed [here](https://github.com/open-compass/opencompass/blob/main/opencompass/utils/datasets_info.py#L259). More datasets will be uploaded soon.
#### (Optional) Automatic Download with ModelScope
You can also use [ModelScope](www.modelscope.cn) to load the datasets on demand.
Installation:
```bash
pip install modelscope
pip install modelscope[framework]
export DATASET_SOURCE=ModelScope
```
@@ -166,32 +198,63 @@ Some third-party features, like Humaneval and Llama, may require additional step
## 🏗️ Evaluation
After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared, you can evaluate the performance of the LLaMA-7b model on the MMLU and C-Eval datasets using the following command:
After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared, you can start your first evaluation using OpenCompass!
```bash
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
```
- Your first evaluation with OpenCompass!
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
OpenCompass supports setting configs via the CLI or a Python script. For simple evaluation settings we recommend the CLI; for more complex evaluations, the script approach is suggested. You can find more example scripts under the configs folder.
```bash
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl -a lmdeploy
```
```bash
# CLI
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
# Python scripts
opencompass ./configs/eval_chat_demo.py
```
```bash
# List all configurations
python tools/list_configs.py
# List all configurations related to llama and mmlu
python tools/list_configs.py llama mmlu
```
You can find more script examples under the [configs](./configs) folder.
You can also evaluate other HuggingFace models via command line. Taking LLaMA-7b as an example:
- API evaluation
```bash
python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
```
OpenCompass, by design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way, or even in one setting.
```bash
export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
# CLI
opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
# Python scripts
opencompass ./configs/eval_api_demo.py
```
- Accelerated Evaluation
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
```bash
# CLI
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
# Python scripts
opencompass ./configs/eval_lmdeploy_demo.py
```
- Supported Models
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
```bash
# List all configurations
python tools/list_configs.py
# List all configurations related to llama and mmlu
python tools/list_configs.py llama mmlu
```
If a model is not on the list but is supported by the Huggingface AutoModel class, you can still evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass-supported model and dataset lists.
```bash
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
```
> \[!TIP\]
>

@@ -69,8 +69,11 @@
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2024.08.20\]** OpenCompass now supports [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists. 🔥🔥🔥
- **\[2024.08.16\]** OpenCompass now supports the brand-new long-context language model evaluation benchmark [RULER](https://arxiv.org/pdf/2404.06654). Through flexible configurations, RULER evaluates long-context task types including retrieval, multi-hop tracing, aggregation, and question answering; see [RULER](configs/datasets/ruler/README.md). 🔥🔥🔥
- **\[2024.07.23\]** We now support the [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) models. Welcome to try! 🔥🔥🔥
- **\[2024.07.23\]** We now support [ModelScope](www.modelscope.cn) datasets; you can load them on demand without first downloading all the data locally. Welcome to try! 🔥🔥🔥
- **\[2024.07.17\]** We have released the example data and evaluation rules for the CompassBench-202408 leaderboard; please visit [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more information. 🔥🔥🔥
- **\[2024.07.17\]** We have released the example data and evaluation rules for the CompassBench-202407 leaderboard; please visit [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more information. 🔥🔥🔥
- **\[2024.07.17\]** We have officially released the NeedleBench [technical report](http://arxiv.org/abs/2407.11963). You are cordially invited to visit our [documentation](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/needleinahaystack_eval.html) for evaluation guidelines. 🔥🔥🔥
- **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which has outstanding reasoning performance, effective support for million-character ultra-long context, and an overall upgrade in tool-calling capability; welcome to visit [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM). 🔥🔥🔥
- **\[2024.06.20\]** OpenCompass now supports one-click switching of inference acceleration backends, making evaluation more efficient. Besides the default HuggingFace inference backend, the popular [LMDeploy](https://github.com/InternLM/lmdeploy) and [vLLM](https://github.com/vllm-project/vllm) are now supported, via either a one-click command-line switch or a deployed API acceleration service; see the [documentation](docs/zh_cn/advanced_guides/accelerator_intro.md) for detailed usage.
@@ -110,35 +113,54 @@ OpenCompass is a one-stop platform for large model evaluation. Its main features are as follows:
<p align="right"><a href="#top">🔝 Back to top</a></p>
## 🛠️ Installation
## 🛠️ Installation Guide
The following shows the steps for quick installation and dataset preparation.
The following provides the steps for quick installation and dataset preparation.
### 💻 Environment Configuration
### 💻 Environment Setup
#### GPU Environment for Open-source Models
We strongly recommend using `conda` to manage your Python environment.
```bash
conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
conda activate opencompass
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
```
- #### Create your virtual environment
#### CPU-only Environment for API Model Testing
```bash
conda create --name opencompass python=3.10 -y
conda activate opencompass
```
```bash
conda create -n opencompass python=3.10 pytorch torchvision torchaudio cpuonly -c pytorch -y
conda activate opencompass
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
# If you need to use API models, install the API-related dependencies via `pip install -r requirements/api.txt`.
```
- #### Install OpenCompass via pip
```bash
# Supports the vast majority of datasets and models
pip install -U opencompass
# Full installation (with support for more datasets)
# pip install "opencompass[full]"
# Model inference backends. Since these backends often conflict in their
# dependencies, we recommend managing them with separate virtual environments.
# pip install "opencompass[lmdeploy]"
# pip install "opencompass[vllm]"
# API testing (e.g. OpenAI, Qwen)
# pip install "opencompass[api]"
```
- #### Install OpenCompass from source
If you want to use the latest features of OpenCompass, you can also build it from source:
```bash
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
# pip install -e ".[full]"
# pip install -e ".[vllm]"
```
### 📂 Data Preparation
#### Offline Download in Advance
OpenCompass supports evaluation with local datasets. The datasets can be downloaded and extracted with the following commands:
```bash
@@ -147,6 +169,13 @@ wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/Ope
unzip OpenCompassData-core-20240207.zip
```
#### Automatic Download from OpenCompass
We now support automatic download of datasets from the OpenCompass storage server. You can run the evaluation with the extra `--dry-run` argument to download these datasets.
Currently supported datasets are listed [here](https://github.com/open-compass/opencompass/blob/main/opencompass/utils/datasets_info.py#L259). More datasets will be uploaded soon.
#### (Optional) Automatic Download with ModelScope
You can also use [ModelScope](www.modelscope.cn) to load the datasets:
Environment preparation:
@@ -167,32 +196,59 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
## 🏗️ Evaluation
After correctly installing OpenCompass per the steps above and preparing the datasets, you can evaluate the performance of the LLaMA-7b model on the MMLU and C-Eval datasets with the following command:
After correctly installing OpenCompass per the steps above and preparing the datasets, you can now start your first evaluation with OpenCompass!
```bash
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
```
- ### Your First Evaluation
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can use the following command. Before doing so, please make sure you have installed the corresponding backend's packages and that your model supports accelerated inference with that backend; see the inference acceleration backend [documentation](docs/zh_cn/advanced_guides/accelerator_intro.md) for more. Below is an example using LMDeploy:
OpenCompass supports setting configs via the command-line interface (CLI) or a Python script. For simple evaluation settings we recommend the CLI; for more complex evaluations, the script approach is suggested. You can find more example scripts under the configs folder.
```bash
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl -a lmdeploy
```
```bash
# Command-line interface (CLI)
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations with the [tool](./docs/zh_cn/tools.md#ListConfigs).
# Python script
opencompass ./configs/eval_chat_demo.py
```
```bash
# List all configurations
python tools/list_configs.py
# List all configurations related to llama and mmlu
python tools/list_configs.py llama mmlu
```
You can find more script examples under the [configs](./configs) folder.
You can also evaluate other HuggingFace models via the command line. Again taking LLaMA-7b as an example:
- ### API Evaluation
```bash
python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
```
OpenCompass, by design, does not distinguish between open-source models and API models. You can evaluate both model types in the same way, or even in one setting.
```bash
export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
# Command-line interface (CLI)
opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
# Python script
opencompass ./configs/eval_api_demo.py
```
- ### Inference Backends
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the following command. Please make sure you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/zh_cn/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
```bash
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
```
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations with the [tool](./docs/zh_cn/tools.md#ListConfigs).
- ### Supported Models
```bash
# List all configurations
python tools/list_configs.py
# List all configurations related to llama and mmlu
python tools/list_configs.py llama mmlu
```
If a model is not on the list but is supported by the Huggingface AutoModel class, you can still evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass-supported model and dataset lists.
```bash
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
```
Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diverse evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/zh_CN/latest/get_started/quick_start.html) to learn how to run an evaluation task.

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,9 +5,9 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# from .datasets.collections.chat_medium import datasets
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
# from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,9 +5,9 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# from .datasets.collections.chat_medium import datasets
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
# from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -6,8 +6,8 @@ from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,9 +5,9 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# from .datasets.collections.chat_medium import datasets
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
# from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,9 +5,9 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# from .datasets.collections.chat_medium import datasets
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
# from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,9 +5,9 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# from .datasets.collections.chat_medium import datasets
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
# from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -1,22 +1,22 @@
from mmengine.config import read_base
with read_base():
from ..datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
from ..datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from ..datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
from ..datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import triviaqa_datasets
from ..datasets.nq.nq_open_1shot_gen_2e45e5 import nq_datasets
from ..datasets.race.race_gen_69ee4f import race_datasets
from ..datasets.winogrande.winogrande_5shot_gen_b36770 import winogrande_datasets
from ..datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
from ..datasets.bbh.bbh_gen_2879b0 import bbh_datasets
from ..datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..datasets.math.math_0shot_gen_393424 import math_datasets
from ..datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets
from ..datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets
from ..datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import triviaqa_datasets
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import nq_datasets
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import winogrande_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
from opencompass.configs.datasets.bbh.bbh_gen_2879b0 import bbh_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets
from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

@@ -0,0 +1,53 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess, match_answer_pattern
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
{question}
A. {textA}
B. {textB}
C. {textC}
D. {textD}
""".strip()
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey')
ARC_c_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=QUERY_TEMPLATE)
], ),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
ARC_c_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)
ARC_c_datasets = [
dict(
abbr='ARC-c',
type=ARCDataset,
path='opencompass/ai2_arc-dev',
name='ARC-Challenge',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg,
)
]
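To make the flow of this config concrete, an editorial sketch (simplified; the real `first_option_postprocess` in OpenCompass is more elaborate): the reader fills the template's placeholders from the dataset columns, and the postprocessor then recovers the predicted letter from the model's free-form output.

```python
# Simplified fill-then-extract illustration; reuses QUERY_TEMPLATE from the
# config above. The real first_option_postprocess handles many more formats.
import re

row = dict(question='Which gas do plants absorb?', textA='Oxygen',
           textB='Carbon dioxide', textC='Nitrogen', textD='Helium')
prompt = QUERY_TEMPLATE.format(**row)      # placeholders -> dataset columns

model_output = 'Plants take in carbon dioxide.\nANSWER: B'
match = re.search(r'ANSWER:\s*([ABCD])', model_output)
pred = match.group(1) if match else ''
assert pred == 'B'                         # compared against answerKey
```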

@@ -0,0 +1,48 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey',
)
ARC_c_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt='Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:',
),
dict(role='BOT', prompt='{answerKey}'),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
inferencer=dict(type=GenInferencer, max_out_len=50),
)
ARC_c_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=first_capital_postprocess),
)
ARC_c_datasets = [
dict(
abbr='ARC-c',
type=ARCDataset,
path='opencompass/ai2_arc-dev',
name='ARC-Challenge',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg,
)
]
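An editorial note on the retriever above: `FixKRetriever` with `fix_id_list=[0, 2, 4, 6, 8]` prepends the same five fixed in-context examples to every test prompt, making this a 5-shot config, with `</E>` marking where the examples are spliced in. A toy illustration (not the OpenCompass class itself):

```python
# Toy fixed-k retrieval: the same k train items become the in-context
# examples for every query (illustrative; not opencompass code).
fix_id_list = [0, 2, 4, 6, 8]
train_pool = [f'Q{i} -> A{i}' for i in range(10)]
ice = [train_pool[i] for i in fix_id_list]  # identical 5-shot prefix each time
print('\n'.join(ice))
```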

@@ -0,0 +1,66 @@
# LCBench2023
LCBench2023 collects questions from LeetCode weekly contests held between 2022 and 2023. It contains Chinese and English versions, each with 581 questions.
## Base Models
| model | lcbench/pass@1 | en/pass@1 | cn/pass@1 | lcbench/pass | lcbench/timeout | lcbench/failed | lcbench/wrong_answer | en/pass | en/timeout | en/failed | en/wrong_answer | cn/pass | cn/timeout | cn/failed | cn/wrong_answer |
|:------------------------:|-----------------:|------------:|------------:|---------------:|------------------:|-----------------:|-----------------------:|----------:|-------------:|------------:|------------------:|----------:|-------------:|------------:|------------------:|
| llama-7b-turbomind | 1.30 | 2.61 | 0.00 | 15 | 28 | 843 | 266 | 15 | 14 | 290 | 257 | 0 | 14 | 553 | 9 |
| llama-13b-turbomind | 2.09 | 4.17 | 0.00 | 24 | 31 | 823 | 274 | 24 | 16 | 270 | 266 | 0 | 15 | 553 | 8 |
| llama-30b-turbomind | 3.48 | 6.78 | 0.17 | 40 | 41 | 780 | 291 | 39 | 25 | 226 | 286 | 1 | 16 | 554 | 5 |
| llama-65b-turbomind | 4.00 | 7.83 | 0.17 | 46 | 22 | 755 | 329 | 45 | 10 | 205 | 316 | 1 | 12 | 550 | 13 |
| llama-2-7b-turbomind | 0.78 | 1.57 | 0.00 | 9 | 28 | 825 | 290 | 9 | 16 | 274 | 277 | 0 | 12 | 551 | 13 |
| llama-2-13b-turbomind | 2.52 | 5.04 | 0.00 | 29 | 29 | 761 | 333 | 29 | 17 | 207 | 323 | 0 | 12 | 554 | 10 |
| llama-2-70b-turbomind | 5.04 | 9.57 | 0.52 | 58 | 47 | 684 | 363 | 55 | 28 | 140 | 353 | 3 | 19 | 544 | 10 |
| llama-3-8b-turbomind | 16.59 | 16.70 | 16.49 | 191 | 30 | 236 | 695 | 96 | 13 | 119 | 348 | 95 | 17 | 117 | 347 |
| llama-3-70b-turbomind | 38.49 | 38.43 | 38.54 | 443 | 2 | 120 | 587 | 221 | 2 | 58 | 295 | 222 | 0 | 62 | 292 |
| internlm2-1.8b-turbomind | 4.34 | 5.04 | 3.65 | 50 | 33 | 333 | 736 | 29 | 18 | 177 | 352 | 21 | 15 | 156 | 384 |
| internlm2-7b-turbomind | 12.16 | 12.52 | 11.81 | 140 | 41 | 166 | 805 | 72 | 23 | 92 | 389 | 68 | 18 | 74 | 416 |
| internlm2-20b-turbomind | 18.46 | 20.96 | 15.97 | 213 | 54 | 134 | 751 | 121 | 24 | 57 | 374 | 92 | 30 | 77 | 377 |
| qwen-1.8b-turbomind | 1.82 | 1.91 | 1.74 | 21 | 31 | 449 | 651 | 11 | 17 | 208 | 340 | 10 | 14 | 241 | 311 |
| qwen-7b-turbomind | 4.95 | 5.39 | 4.51 | 57 | 37 | 388 | 670 | 31 | 15 | 197 | 333 | 26 | 22 | 191 | 337 |
| qwen-14b-turbomind | 8.86 | 9.74 | 7.99 | 102 | 2 | 245 | 803 | 56 | 0 | 120 | 400 | 46 | 2 | 125 | 403 |
| qwen-72b-turbomind | 16.86 | 19.48 | 14.24 | 194 | 12 | 229 | 717 | 112 | 4 | 112 | 348 | 82 | 8 | 117 | 369 |
| qwen1.5-0.5b-hf | 0.87 | 0.52 | 1.22 | 10 | 29 | 499 | 614 | 3 | 10 | 259 | 304 | 7 | 19 | 240 | 310 |
| qwen1.5-1.8b-hf | 2.00 | 2.26 | 1.74 | 23 | 26 | 434 | 669 | 13 | 10 | 220 | 333 | 10 | 16 | 214 | 336 |
| qwen1.5-4b-hf | 5.65 | 6.96 | 4.34 | 65 | 37 | 349 | 701 | 40 | 19 | 161 | 356 | 25 | 18 | 188 | 345 |
| qwen1.5-7b-hf | 6.69 | 8.00 | 5.38 | 77 | 30 | 283 | 762 | 46 | 12 | 124 | 394 | 31 | 18 | 159 | 368 |
| qwen1.5-14b-hf | 12.69 | 13.74 | 11.63 | 146 | 43 | 232 | 731 | 79 | 22 | 122 | 353 | 67 | 21 | 110 | 378 |
| qwen1.5-32b-hf | 14.34 | 16.70 | 11.98 | 165 | 45 | 191 | 751 | 96 | 18 | 88 | 374 | 69 | 27 | 103 | 377 |
| qwen1.5-72b-hf | 15.29 | 15.65 | 14.93 | 176 | 11 | 242 | 723 | 90 | 7 | 118 | 361 | 86 | 4 | 124 | 362 |
| qwen1.5-moe-a2-7b-hf | 9.56 | 10.09 | 9.03 | 110 | 10 | 272 | 760 | 58 | 5 | 129 | 384 | 52 | 5 | 143 | 376 |
| mistral-7b-v0.1-hf | 11.38 | 11.83 | 10.94 | 131 | 30 | 221 | 770 | 68 | 11 | 100 | 397 | 63 | 19 | 121 | 373 |
| mistral-7b-v0.2-hf | 11.38 | 11.13 | 11.63 | 131 | 2 | 259 | 760 | 64 | 2 | 124 | 386 | 67 | 0 | 135 | 374 |
| mixtral-8x7b-v0.1-hf | 21.11 | 21.39 | 20.83 | 243 | 7 | 165 | 737 | 123 | 4 | 76 | 373 | 120 | 3 | 89 | 364 |
| mixtral-8x22b-v0.1-hf | 30.97 | 31.22 | 30.73 | 357 | 6 | 131 | 658 | 180 | 3 | 66 | 327 | 177 | 3 | 65 | 331 |
| yi-6b-hf | 2.43 | 2.78 | 2.08 | 28 | 7 | 456 | 661 | 16 | 2 | 214 | 344 | 12 | 5 | 242 | 317 |
| yi-34b-hf | 8.25 | 8.35 | 8.16 | 95 | 8 | 319 | 730 | 48 | 5 | 163 | 360 | 47 | 3 | 156 | 370 |
| deepseek-7b-base-hf | 5.30 | 5.22 | 5.38 | 61 | 7 | 325 | 759 | 30 | 4 | 165 | 377 | 31 | 3 | 160 | 382 |
| deepseek-67b-base-hf | 26.50 | 26.96 | 26.04 | 305 | 9 | 202 | 636 | 155 | 4 | 105 | 312 | 150 | 5 | 97 | 324 |
## Chat Models
| model | lcbench/pass@1 | en/pass@1 | cn/pass@1 | lcbench/pass | lcbench/timeout | lcbench/failed | lcbench/wrong_answer | en/pass | en/timeout | en/failed | en/wrong_answer | cn/pass | cn/timeout | cn/failed | cn/wrong_answer |
|:-----------------------------:|-----------------:|------------:|------------:|---------------:|------------------:|-----------------:|-----------------------:|----------:|-------------:|------------:|------------------:|----------:|-------------:|------------:|------------------:|
| qwen1.5-0.5b-chat-hf | 0.00 | 0.00 | 0.00 | 0 | 0 | 1152 | 0 | 0 | 0 | 576 | 0 | 0 | 0 | 576 | 0 |
| qwen1.5-1.8b-chat-hf | 1.65 | 1.57 | 1.74 | 19 | 5 | 603 | 525 | 9 | 2 | 298 | 267 | 10 | 3 | 305 | 258 |
| qwen1.5-4b-chat-hf | 5.56 | 5.22 | 5.90 | 64 | 17 | 484 | 587 | 30 | 8 | 242 | 296 | 34 | 9 | 242 | 291 |
| qwen1.5-7b-chat-hf | 8.78 | 9.57 | 7.99 | 101 | 25 | 333 | 693 | 55 | 12 | 151 | 358 | 46 | 13 | 182 | 335 |
| qwen1.5-14b-chat-hf | 14.42 | 16.52 | 12.33 | 166 | 18 | 222 | 746 | 95 | 10 | 110 | 361 | 71 | 8 | 112 | 385 |
| qwen1.5-32b-chat-hf | 10.78 | 13.04 | 8.51 | 124 | 15 | 516 | 497 | 75 | 10 | 195 | 296 | 49 | 5 | 321 | 201 |
| qwen1.5-72b-chat-hf | 18.77 | 18.78 | 18.75 | 216 | 23 | 164 | 749 | 108 | 12 | 89 | 367 | 108 | 11 | 75 | 382 |
| qwen1.5-110b-chat-hf | 34.58 | 34.43 | 34.72 | 399 | 20 | 176 | 557 | 199 | 12 | 85 | 280 | 200 | 8 | 91 | 277 |
| internlm2-chat-1.8b-hf | 4.52 | 5.04 | 3.99 | 52 | 10 | 364 | 726 | 29 | 4 | 172 | 371 | 23 | 6 | 192 | 355 |
| internlm2-chat-1.8b-sft-hf | 3.56 | 3.83 | 3.30 | 41 | 12 | 403 | 696 | 22 | 6 | 211 | 337 | 19 | 6 | 192 | 359 |
| internlm2-chat-7b-hf | 14.60 | 13.74 | 15.45 | 168 | 12 | 238 | 734 | 79 | 7 | 142 | 348 | 89 | 5 | 96 | 386 |
| internlm2-chat-7b-sft-hf | 14.34 | 14.61 | 14.06 | 165 | 9 | 275 | 703 | 84 | 3 | 174 | 315 | 81 | 6 | 101 | 388 |
| internlm2-chat-20b-hf | 19.64 | 20.00 | 19.27 | 226 | 11 | 191 | 724 | 115 | 7 | 83 | 371 | 111 | 4 | 108 | 353 |
| internlm2-chat-20b-sft-hf | 20.55 | 19.91 | 21.18 | 237 | 11 | 195 | 709 | 115 | 6 | 94 | 361 | 122 | 5 | 101 | 348 |
| llama-3-8b-instruct-hf | 28.50 | 29.04 | 27.95 | 328 | 17 | 95 | 712 | 167 | 7 | 44 | 358 | 161 | 10 | 51 | 354 |
| llama-3-70b-instruct-hf | 45.44 | 46.09 | 44.79 | 523 | 8 | 52 | 569 | 265 | 2 | 25 | 284 | 258 | 6 | 27 | 285 |
| llama-3-8b-instruct-lmdeploy | 29.02 | 29.39 | 28.65 | 334 | 19 | 94 | 705 | 169 | 11 | 42 | 354 | 165 | 8 | 52 | 351 |
| llama-3-70b-instruct-lmdeploy | 44.66 | 46.78 | 42.53 | 514 | 11 | 44 | 583 | 269 | 5 | 19 | 283 | 245 | 6 | 25 | 300 |
| mistral-7b-instruct-v0.1-hf | 9.82 | 10.78 | 8.85 | 113 | 17 | 316 | 706 | 62 | 9 | 152 | 353 | 51 | 8 | 164 | 353 |
| mistral-7b-instruct-v0.2-hf | 7.90 | 6.26 | 9.55 | 91 | 8 | 572 | 481 | 36 | 4 | 345 | 191 | 55 | 4 | 227 | 290 |
| mixtral-8x7b-instruct-v0.1-hf | 16.29 | 15.91 | 16.67 | 188 | 13 | 370 | 581 | 92 | 6 | 241 | 237 | 96 | 7 | 129 | 344 |

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .lcbench_gen_5ff288 import LCBench_datasets # noqa: F401, F403

View File

@ -0,0 +1,107 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LCDataset, LCPassKEvaluator
LC_reader_cfg = dict(
input_columns=['text', 'test_list'], output_column='test_column')
LC_en_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'
),
dict(
role='BOT',
prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt="You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with [\"a\",\"b\",\"c\"], we need to push the key one time to type \"a\", two times to type \"b\", and three times to type \"c\" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. Your code should pass these tests:\n\n assert minimumPushes(\"abcde\") == 5 \n assert minimumPushes(\"xyzxyzxyzxyz\") == 12 \n assert minimumPushes(\"aabbccddeeffgghhiiiiii\") == 24 \n"
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'
),
dict(role='BOT', prompt='[BEGIN]\n'),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
LC_cn_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k1 <= k <= n你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'
),
dict(
role='BOT',
prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt="你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 [\"a\",\"b\",\"c\"],我们需要按一次键来输入 \"a\",按两次键来输入 \"b\",按三次键来输入 \"c\"\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1*# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes(\"abcde\") == 5 \n assert minimumPushes(\"xyzxyzxyzxyz\") == 12 \n assert minimumPushes(\"aabbccddeeffgghhiiiiii\") == 24 \n"
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'
),
dict(role='BOT', prompt='[BEGIN]\n'),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
LCBench_datasets = [
dict(
type=LCDataset,
abbr='lcbench_en',
path='./data/LCBench2023/LCBench2023.jsonl',
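        # one completion per problem (pass@1 setting); a separate repeat10 config samples 10 for pass@k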
num_repeats=1,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_en_infer_cfg,
eval_cfg=LC_eval_cfg),
dict(
type=LCDataset,
abbr='lcbench_cn',
path='./data/LCBench2023/LCBench2023_cn.jsonl',
num_repeats=1,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_cn_infer_cfg,
eval_cfg=LC_eval_cfg)
]

View File

@ -0,0 +1,77 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LCDataset, LCPassKEvaluator
LC_difficulties_list = ['EASY', 'MEDIUM', 'HARD']
LC_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column')
LC_en_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'),
dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with ["a","b","c"], we need to push the key one time to type "a", two times to type "b", and three times to type "c" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. Your code should pass these tests:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'),
dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "),
dict(role='HUMAN', prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"),
dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'),
dict(role='BOT', prompt='[BEGIN]\n'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
LC_cn_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k1 <= k <= n你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'),
dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 ["a","b","c"],我们需要按一次键来输入 "a",按两次键来输入 "b",按三次键来输入 "c"\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1*# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'),
dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'),
dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'),
dict(role='BOT', prompt='[BEGIN]\n'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
LCBench_datasets = []
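# one English and one Chinese dataset entry per difficulty split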
for difficulty in LC_difficulties_list:
LCBench_datasets.append(
dict(
type=LCDataset,
abbr='lcbench_en-' + difficulty,
path='data/LCBench2023/LCBench2023.jsonl',
difficulty=difficulty,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_en_infer_cfg,
eval_cfg=LC_eval_cfg,
)
)
LCBench_datasets.append(
dict(
type=LCDataset,
abbr='lcbench_cn-' + difficulty,
path='data/LCBench2023/LCBench2023_cn.jsonl',
difficulty=difficulty,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_cn_infer_cfg,
eval_cfg=LC_eval_cfg,
)
)

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .lcbench_repeat10_gen_5ff288 import LCBench_datasets_repeat10 # noqa: F401, F403

View File

@ -0,0 +1,106 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LCDataset, LCPassKEvaluator
LC_reader_cfg = dict(
input_columns=['text', 'test_list'], output_column='test_column')
LC_en_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'
),
dict(
role='BOT',
prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt="You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with [\"a\",\"b\",\"c\"], we need to push the key one time to type \"a\", two times to type \"b\", and three times to type \"c\" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. Your code should pass these tests:\n\n assert minimumPushes(\"abcde\") == 5 \n assert minimumPushes(\"xyzxyzxyzxyz\") == 12 \n assert minimumPushes(\"aabbccddeeffgghhiiiiii\") == 24 \n"
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'
),
dict(role='BOT', prompt='[BEGIN]\n'),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
LC_cn_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k1 <= k <= n你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'
),
dict(
role='BOT',
prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt="你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 [\"a\",\"b\",\"c\"],我们需要按一次键来输入 \"a\",按两次键来输入 \"b\",按三次键来输入 \"c\"\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1*# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes(\"abcde\") == 5 \n assert minimumPushes(\"xyzxyzxyzxyz\") == 12 \n assert minimumPushes(\"aabbccddeeffgghhiiiiii\") == 24 \n"
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'
),
dict(role='BOT', prompt='[BEGIN]\n'),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
LCBench_datasets_repeat10 = [
dict(
type=LCDataset,
abbr='lcbench_en_repeat10',
path='./data/LCBench2023/LCBench2023.jsonl',
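        # sample 10 completions per problem so LCPassKEvaluator can estimate pass@k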
num_repeats=10,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_en_infer_cfg,
eval_cfg=LC_eval_cfg),
dict(
type=LCDataset,
abbr='lcbench_cn_repeat10',
path='./data/LCBench2023/LCBench2023_cn.jsonl',
num_repeats=10,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_cn_infer_cfg,
eval_cfg=LC_eval_cfg)
]

View File

@ -0,0 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BoolQDatasetV2
from opencompass.utils.text_postprocessors import (
first_option_postprocess,
)
QUERY_TEMPLATE = """
Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
Passage: {passage}
Question: {question}
A. Yes
B. No
""".strip()
BoolQ_reader_cfg = dict(
input_columns=['question', 'passage'],
output_column='label',
)
BoolQ_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
BoolQ_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
)
BoolQ_datasets = [
dict(
abbr='BoolQ',
type=BoolQDatasetV2,
path='opencompass/boolq',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,
eval_cfg=BoolQ_eval_cfg,
)
]

View File

@ -0,0 +1,47 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BoolQDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess
BoolQ_reader_cfg = dict(
input_columns=['question', 'passage'],
output_column='label',
)
BoolQ_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt='{passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer:',
),
dict(role='BOT', prompt='{label}'),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
inferencer=dict(type=GenInferencer, max_out_len=50),
)
BoolQ_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=first_capital_postprocess),
)
BoolQ_datasets = [
dict(
abbr='BoolQ',
type=BoolQDatasetV2,
path='opencompass/boolq',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,
eval_cfg=BoolQ_eval_cfg,
)
]

View File

@ -33,7 +33,7 @@ BoolQ_datasets = [
dict(
abbr='BoolQ',
type=BoolQDatasetV2,
path='./data/SuperGLUE/BoolQ/val.jsonl',
path='opencompass/boolq',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,
eval_cfg=BoolQ_eval_cfg,

View File

@ -0,0 +1,43 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BoolQDatasetV2
BoolQ_reader_cfg = dict(
input_columns=['question', 'passage'],
output_column='label',
)
BoolQ_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
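            # PPLInferencer scores each candidate continuation ('Yes' vs 'No') and predicts the lower-perplexity label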
'A':
dict(round=[
dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
dict(role='BOT', prompt='Yes'),
]),
'B':
dict(round=[
dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
dict(role='BOT', prompt='No'),
]),
},
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
BoolQ_datasets = [
dict(
abbr='BoolQ',
type=BoolQDatasetV2,
path='opencompass/boolq',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,
eval_cfg=BoolQ_eval_cfg,
)
]

View File

@ -35,7 +35,7 @@ BoolQ_datasets = [
dict(
abbr='BoolQ',
type=BoolQDatasetV3,
path='./data/SuperGLUE/BoolQ/val.jsonl',
path='opencompass/boolq',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,
eval_cfg=BoolQ_eval_cfg,

View File

@ -36,7 +36,7 @@ BoolQ_datasets = [
type=BoolQDataset,
abbr='BoolQ',
path='json',
data_files='./data/SuperGLUE/BoolQ/val.jsonl',
data_files='opencompass/boolq',
split='train',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,

View File

@ -36,7 +36,7 @@ BoolQ_datasets = [
type=BoolQDataset,
abbr='BoolQ',
path='json',
data_files='./data/SuperGLUE/BoolQ/val.jsonl',
data_files='opencompass/boolq',
split='train',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,

View File

@ -26,7 +26,7 @@ BoolQ_datasets = [
type=BoolQDataset,
abbr='BoolQ',
path='json',
data_files='./data/SuperGLUE/BoolQ/val.jsonl',
data_files='opencompass/boolq',
split='train',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,

View File

@ -0,0 +1,99 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
bbh_datasets = []
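# multiple-choice tasks get option-letter postprocessing; free-form tasks are scored by the generic BBHEvaluator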
for _name in bbh_multiple_choice_sets:
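    # each task's few-shot chain-of-thought exemplars are stored in lib_prompt/<task>.txt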
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step. And you must give your final answer by starting with 'So the answer is' "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(type=BBHEvaluator_mcq),
pred_role='BOT',
pred_postprocessor=dict(type=bbh_mcq_postprocess),
dataset_postprocessor=dict(type=bbh_mcq_postprocess))
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
for _name in bbh_free_form_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step. And you must give your final answer by starting with 'So the answer is' "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))

View File

@ -0,0 +1,117 @@
# CaLM Lite
**CaLM Lite** is a lightweight version of CaLM.
**Ca**usal evaluation of **L**anguage **M**odels (CaLM) is, to the best of our knowledge, the first comprehensive benchmark for evaluating the causal reasoning capabilities of language models. The CaLM framework establishes a foundational taxonomy consisting of four modules: causal target (i.e., what to evaluate), adaptation (i.e., how to obtain the results), metric (i.e., how to measure the results), and error (i.e., how to analyze the bad results).
<div align="center">
[🌐 Website](https://opencausalab.github.io/CaLM) |
[📃 Report](https://arxiv.org/abs/2405.00622) | [🎆 GitHub](https://github.com/OpenCausaLab/CaLM) | 📧 Welcome to join us by email at causalai@pjlab.org.cn
</div>
## Quick Start
### Data Preparation
Download the dataset to the `data/` folder.
```
wget https://github.com/OpenCausaLab/CaLM/releases/download/v1.0.0.lite/calm.zip
unzip calm.zip
```
### Run Model and Infer
To obtain a concise output with only the averaged results across all tasks, use:
```
python run.py --models YOUR_MODEL --datasets calm --summarizer calm
```
If you want detailed information for each task, use:
```
python run.py --models YOUR_MODEL --datasets calm
```
The `--summarizer calm` flag in the first command generates a summarized output; omitting it, as in the second command, yields task-specific details.
## Available Causal Tasks
We provide 92 tasks for causal evaluation, stored in the `data/calm` folder. For more information about our causal tasks, refer to [tasks](https://github.com/OpenCausaLab/CaLM/blob/main/documents/tasks.md).
The directory structure is:
```
├── calm
| ├── association
| ├── causal_discovery # Rung of the causal ladder
| │ ├── abstract_reasoning # Causal scenario
| │ │ ├── AR-B_CaLM-AR_CN.json # Causal task
| │ | └── AR-B_CaLM-AR_EN.json # Causal task
| │ └── ...
| └── ...
└── ...
```
## Dataset
- **Dataset size**: CaLM Lite uses a light dataset of **9,200** samples, while CaLM uses a significantly larger dataset of 126,334 samples. The table below details the English dataset composition; the Chinese version is structured identically.
- **Dataset configuration**: We prioritize balance in our dataset for **binary classification** and **choice selection** questions. By ensuring an equal number of each ground-truth label, we minimize the risk of introducing bias into the model's testing. For **probability calculation**, CaLM Lite takes extra care to balance the number of problems across different causal reasoning processes. (For more details on how the causal reasoning process is defined, please refer to Section 9.1.6 of the [paper](https://arxiv.org/abs/2405.00622).)
- **Efficient evaluation**: For enhanced evaluation efficiency, OpenCompass offers customizable methods. Refer to the [documentation](https://opencompass.org.cn/doc) for guidance on tailoring these methods to your needs.
| Causal ladder | Causal scenario | Subset | Question type | Mode | CaLM Lite | CaLM |
|---------------|-----------------|--------|---------------|------|-----------|------|
| Causal discovery | PCD | E-CARE | Binary classification | Natural | 100 | 2000 |
| Causal discovery | PCD | E-CARE | Choice selection | Natural | 100 | 1000 |
| Causal discovery | PCD | COPA | Binary classification | Natural | 100 | 2000 |
| Causal discovery | PCD | COPA | Choice selection | Natural | 100 | 1000 |
| Causal discovery | ECI | CTB | Binary classification | Natural | 100 | 596 |
| Causal discovery | ECI | ESC | Binary classification | Natural | 100 | 1000 |
| Causal discovery | ECI | MAVEN-ERE | Binary classification | Natural | 100 | 1000 |
| Causal discovery | AR | CaLM-AR | Binary classification | Symbolic | 100 | 1600 |
| Causal discovery | CA | FP | Binary classification | Symbolic | 100 | 1600 |
| Causal discovery | CA | FA | Binary classification | Symbolic | 100 | 1600 |
| Association | CORR | correlation | Binary classification | Natural | 100 | 1476 |
| Association | EAE | exp-away | Binary classification | Natural | 100 | 168 |
| Intervention | CB | collider-bias | Binary classification | Natural | 100 | 163 |
| Intervention | ATE | ATE-natural | Binary classification | Natural | 100 | 1600 |
| Intervention | ATE | ATE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | ATE | ATE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | CDE | CDE-natural | Binary classification | Natural | 100 | 1600 |
| Intervention | CDE | CDE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | CDE | CDE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | BAS | backadj | Binary classification | Natural | 100 | 227 |
| Intervention | BAS | max-BAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | BAS | min-BAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | BAS | mix-BAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | FAS | FAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | IV | CaLM-IV | Choice selection | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.2-UC | Binary classification | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.4-UC | Binary classification | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.6-UC | Binary classification | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.8-UC | Binary classification | Symbolic | 100 | 1600 |
| Counterfactuals | ETT | ETT-natural | Binary classification | Natural | 100 | 1600 |
| Counterfactuals | ETT | ETT-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | ETT | ETT-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NDE | NDE-natural | Binary classification | Natural | 100 | 1600 |
| Counterfactuals | NDE | NDE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NDE | NDE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NIE | NIE-natural | Binary classification | Natural | 100 | 1600 |
| Counterfactuals | NIE | NIE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NIE | NIE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PN | PN-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PN | PN-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PS | PS-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PS | PS-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | AC | causal judgement | Binary classification | Natural | 100 | 187 |
| Counterfactuals | CR | CRASS | Choice selection | Natural | 100 | 274 |
| Counterfactuals | CR | det-counterfactual | Binary classification | Natural | 100 | 1476 |
| Counterfactuals | CEG | E-CARE | Open-ended generation | Natural | 100 | 1000 |
| **Total** | | | | | 4600 | 63167 |
## Available Prompt Styles (Adaptation)
Basic Prompt is our default setting for efficient evaluation of CaLM Lite, but we provide flexibility for exploring additional prompts through CaLM. If you'd like to explore and compare a wider range of prompts, we encourage you to use CaLM. We provide a comprehensive and easy-to-follow guide to assist you in our [repository](https://github.com/OpenCausaLab/CaLM).
## Citation
```
@misc{chen2024causal,
title={Causal Evaluation of Language Models},
author={Sirui Chen and Bo Peng and Meiqi Chen and Ruiqi Wang and Mengying Xu and Xingyu Zeng and Rui Zhao and Shengjie Zhao and Yu Qiao and Chaochao Lu},
year={2024},
eprint={2405.00622},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```

View File

@ -0,0 +1,160 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CaLMDataset, CaLMEvaluator
task_hierarchy_dict = {
# association/
# correlation/
'CORR-B_correlation_CN':'association/correlation/',
'CORR-B_correlation_EN':'association/correlation/',
# explaining_away_effect/
'EAE-B_exp-away_CN':'association/explaining_away_effect/',
'EAE-B_exp-away_EN':'association/explaining_away_effect/',
# causal_discovery/
# abstract_reasoning/
'AR-B_CaLM-AR_CN':'causal_discovery/abstract_reasoning/',
'AR-B_CaLM-AR_EN':'causal_discovery/abstract_reasoning/',
# causal_attribution/
'CA-B_FA_CN':'causal_discovery/causal_attribution/',
'CA-B_FA_EN':'causal_discovery/causal_attribution/',
'CA-B_FP_CN':'causal_discovery/causal_attribution/',
'CA-B_FP_EN':'causal_discovery/causal_attribution/',
# event_causality_identification/
'ECI-B_CTB_CN':'causal_discovery/event_causality_identification/',
'ECI-B_CTB_EN':'causal_discovery/event_causality_identification/',
'ECI-B_ESC_CN':'causal_discovery/event_causality_identification/',
'ECI-B_ESC_EN':'causal_discovery/event_causality_identification/',
'ECI-B_MAVEN-ERE_CN':'causal_discovery/event_causality_identification/',
'ECI-B_MAVEN-ERE_EN':'causal_discovery/event_causality_identification/',
# pairwise_causal_discovery/
'PCD-B_COPA_CN':'causal_discovery/pairwise_causal_discovery/',
'PCD-B_COPA_EN':'causal_discovery/pairwise_causal_discovery/',
'PCD-B_E-CARE_CN':'causal_discovery/pairwise_causal_discovery/',
'PCD-B_E-CARE_EN':'causal_discovery/pairwise_causal_discovery/',
'PCD-C_COPA_CN':'causal_discovery/pairwise_causal_discovery/',
'PCD-C_COPA_EN':'causal_discovery/pairwise_causal_discovery/',
'PCD-C_E-CARE_CN':'causal_discovery/pairwise_causal_discovery/',
'PCD-C_E-CARE_EN':'causal_discovery/pairwise_causal_discovery/',
# counterfactual/
# actual_causality/
'AC-B_causal_judgement_CN':'counterfactual/actual_causality/',
'AC-B_causal_judgement_EN':'counterfactual/actual_causality/',
# causal_explanation_generation/
'CEG-O_E-CARE_CN':'counterfactual/causal_explanation_generation/',
'CEG-O_E-CARE_EN':'counterfactual/causal_explanation_generation/',
# counterfactual_reasoning/
'CR-B_det-counterfactual_CN':'counterfactual/counterfactual_reasoning/',
'CR-B_det-counterfactual_EN':'counterfactual/counterfactual_reasoning/',
'CR-C_CRASS_CN':'counterfactual/counterfactual_reasoning/',
'CR-C_CRASS_EN':'counterfactual/counterfactual_reasoning/',
# effect_of_the_treatment_on_the_treated/
'ETT-B_ETT-natural_CN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-B_ETT-natural_EN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-P_ETT-basic_CN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-P_ETT-basic_EN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-P_ETT-hard_CN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-P_ETT-hard_EN':'counterfactual/effect_of_the_treatment_on_the_treated/',
# natural_direct_effect/
'NDE-B_NDE-natural_CN':'counterfactual/natural_direct_effect/',
'NDE-B_NDE-natural_EN':'counterfactual/natural_direct_effect/',
'NDE-P_NDE-basic_CN':'counterfactual/natural_direct_effect/',
'NDE-P_NDE-basic_EN':'counterfactual/natural_direct_effect/',
'NDE-P_NDE-hard_CN':'counterfactual/natural_direct_effect/',
'NDE-P_NDE-hard_EN':'counterfactual/natural_direct_effect/',
# natural_indirect_effect/
'NIE-B_NIE-natural_CN':'counterfactual/natural_indirect_effect/',
'NIE-B_NIE-natural_EN':'counterfactual/natural_indirect_effect/',
'NIE-P_NIE-basic_CN':'counterfactual/natural_indirect_effect/',
'NIE-P_NIE-basic_EN':'counterfactual/natural_indirect_effect/',
'NIE-P_NIE-hard_CN':'counterfactual/natural_indirect_effect/',
'NIE-P_NIE-hard_EN':'counterfactual/natural_indirect_effect/',
# probability_of_necessity/
'PN-P_PN-basic_CN':'counterfactual/probability_of_necessity/',
'PN-P_PN-basic_EN':'counterfactual/probability_of_necessity/',
'PN-P_PN-hard_CN':'counterfactual/probability_of_necessity/',
'PN-P_PN-hard_EN':'counterfactual/probability_of_necessity/',
# probability_of_sufficiency/
'PS-P_PS-basic_CN':'counterfactual/probability_of_sufficiency/',
'PS-P_PS-basic_EN':'counterfactual/probability_of_sufficiency/',
'PS-P_PS-hard_CN':'counterfactual/probability_of_sufficiency/',
'PS-P_PS-hard_EN':'counterfactual/probability_of_sufficiency/',
# intervention/
# average_treatment_effect/
'ATE-B_ATE-natural_CN':'intervention/average_treatment_effect/',
'ATE-B_ATE-natural_EN':'intervention/average_treatment_effect/',
'ATE-P_ATE-basic_CN':'intervention/average_treatment_effect/',
'ATE-P_ATE-basic_EN':'intervention/average_treatment_effect/',
'ATE-P_ATE-hard_CN':'intervention/average_treatment_effect/',
'ATE-P_ATE-hard_EN':'intervention/average_treatment_effect/',
# backdoor_adjustment_set/
'BAS-B_backadj_CN':'intervention/backdoor_adjustment_set/',
'BAS-B_backadj_EN':'intervention/backdoor_adjustment_set/',
'BAS-C_max-BAS_CN':'intervention/backdoor_adjustment_set/',
'BAS-C_max-BAS_EN':'intervention/backdoor_adjustment_set/',
'BAS-C_min-BAS_CN':'intervention/backdoor_adjustment_set/',
'BAS-C_min-BAS_EN':'intervention/backdoor_adjustment_set/',
'BAS-C_mix-BAS_CN':'intervention/backdoor_adjustment_set/',
'BAS-C_mix-BAS_EN':'intervention/backdoor_adjustment_set/',
# causal_effect_identification/
'CEI-B_0.2-UC_CN':'intervention/causal_effect_identification/',
'CEI-B_0.2-UC_EN':'intervention/causal_effect_identification/',
'CEI-B_0.4-UC_CN':'intervention/causal_effect_identification/',
'CEI-B_0.4-UC_EN':'intervention/causal_effect_identification/',
'CEI-B_0.6-UC_CN':'intervention/causal_effect_identification/',
'CEI-B_0.6-UC_EN':'intervention/causal_effect_identification/',
'CEI-B_0.8-UC_CN':'intervention/causal_effect_identification/',
'CEI-B_0.8-UC_EN':'intervention/causal_effect_identification/',
# collider_bias/
'CB-B_collider-bias_CN':'intervention/collider_bias/',
'CB-B_collider-bias_EN':'intervention/collider_bias/',
# controlled_direct_effect/
'CDE-B_CDE-natural_CN':'intervention/controlled_direct_effect/',
'CDE-B_CDE-natural_EN':'intervention/controlled_direct_effect/',
'CDE-P_CDE-basic_CN':'intervention/controlled_direct_effect/',
'CDE-P_CDE-basic_EN':'intervention/controlled_direct_effect/',
'CDE-P_CDE-hard_CN':'intervention/controlled_direct_effect/',
'CDE-P_CDE-hard_EN':'intervention/controlled_direct_effect/',
# frontdoor_adjustment_set/
'FAS-C_FAS_CN':'intervention/frontdoor_adjustment_set/',
'FAS-C_FAS_EN':'intervention/frontdoor_adjustment_set/',
# instrumental_variable/
'IV-C_CaLM-IV_CN':'intervention/instrumental_variable/',
'IV-C_CaLM-IV_EN':'intervention/instrumental_variable/',}
calm_reader_cfg = dict(
input_columns=['question'],
output_column='gt_item')
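# drop the trailing '_CN'/'_EN' suffix so each task contributes one base name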
calm_all_sets = list(set(key[:-3] for key in task_hierarchy_dict.keys()))
calm_datasets = []
for _name in calm_all_sets:
for _prompt_style in ['basic','basic-CN']:
_task_name = _name + ('_CN' if _prompt_style.endswith('-CN') else '_EN')
        _path = f'./data/calm/{task_hierarchy_dict[_task_name]}{_task_name}.json'
calm_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='{question}'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=500))
calm_eval_cfg = dict(evaluator=dict(
type=CaLMEvaluator,
core_metrics=True,
error_analysis=True,
prompt_style=_prompt_style,
task=_task_name))
calm_datasets.append(
dict(
abbr=f'calm_{_task_name}',
type=CaLMDataset,
path=_path,
prompt_style=_prompt_style,
reader_cfg=calm_reader_cfg,
infer_cfg=calm_infer_cfg,
eval_cfg=calm_eval_cfg)
)
del _prompt_style, _task_name, _path, _name

View File

@ -0,0 +1,108 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import CEvalDataset
ceval_subject_mapping = {
'computer_network': ['Computer Network', '计算机网络', 'STEM'],
'operating_system': ['Operating System', '操作系统', 'STEM'],
'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
'college_programming': ['College Programming', '大学编程', 'STEM'],
'college_physics': ['College Physics', '大学物理', 'STEM'],
'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
'college_economics': ['College Economics', '大学经济学', 'Social Science'],
'business_administration': ['Business Administration', '工商管理', 'Social Science'],
'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
'education_science': ['Education Science', '教育学', 'Social Science'],
'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
'logic': ['Logic', '逻辑学', 'Humanities'],
'law': ['Law', '法学', 'Humanities'],
'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
'art_studies': ['Art Studies', '艺术学', 'Humanities'],
'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
'high_school_history': ['High School History', '高中历史', 'Humanities'],
'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
'civil_servant': ['Civil Servant', '公务员', 'Other'],
'sports_science': ['Sports Science', '体育学', 'Other'],
'plant_protection': ['Plant Protection', '植物保护', 'Other'],
'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
'accountant': ['Accountant', '注册会计师', 'Other'],
'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
'physician': ['Physician', '医师资格', 'Other'],
}
ceval_all_sets = list(ceval_subject_mapping.keys())
ceval_datasets = []
for _split in ['val', 'test']:
for _name in ceval_all_sets:
_ch_name = ceval_subject_mapping[_name][1]
ceval_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={
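# One prompt per candidate option; PPLInferencer scores each continuation and picks the lowest-perplexity one.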
answer: dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=
f'以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案: '
),
dict(role='BOT', prompt=answer),
])
for answer in ['A', 'B', 'C', 'D']
},
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer),
)
ceval_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
ceval_datasets.append(
dict(
type=CEvalDataset,
path='./data/ceval_internal/formal_ceval',
local_mode=True,
name=_name,
abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' +
_name,
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split=_split),
infer_cfg=ceval_infer_cfg,
eval_cfg=ceval_eval_cfg,
))
del _split, _name, _ch_name

View File

@ -0,0 +1,130 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
cmmlu_subject_mapping = {
'agronomy': '农学',
'anatomy': '解剖学',
'ancient_chinese': '古汉语',
'arts': '艺术学',
'astronomy': '天文学',
'business_ethics': '商业伦理',
'chinese_civil_service_exam': '中国公务员考试',
'chinese_driving_rule': '中国驾驶规则',
'chinese_food_culture': '中国饮食文化',
'chinese_foreign_policy': '中国外交政策',
'chinese_history': '中国历史',
'chinese_literature': '中国文学',
'chinese_teacher_qualification': '中国教师资格',
'clinical_knowledge': '临床知识',
'college_actuarial_science': '大学精算学',
'college_education': '大学教育学',
'college_engineering_hydrology': '大学工程水文学',
'college_law': '大学法律',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'college_medicine': '大学医学',
'computer_science': '计算机科学',
'computer_security': '计算机安全',
'conceptual_physics': '概念物理学',
'construction_project_management': '建设工程管理',
'economics': '经济学',
'education': '教育学',
'electrical_engineering': '电气工程',
'elementary_chinese': '小学语文',
'elementary_commonsense': '小学常识',
'elementary_information_and_technology': '小学信息技术',
'elementary_mathematics': '初等数学',
'ethnology': '民族学',
'food_science': '食品科学',
'genetics': '遗传学',
'global_facts': '全球事实',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_geography': '高中地理',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'high_school_politics': '高中政治',
'human_sexuality': '人类性行为',
'international_law': '国际法学',
'journalism': '新闻学',
'jurisprudence': '法理学',
'legal_and_moral_basis': '法律与道德基础',
'logical': '逻辑学',
'machine_learning': '机器学习',
'management': '管理学',
'marketing': '市场营销',
'marxist_theory': '马克思主义理论',
'modern_chinese': '现代汉语',
'nutrition': '营养学',
'philosophy': '哲学',
'professional_accounting': '专业会计',
'professional_law': '专业法学',
'professional_medicine': '专业医学',
'professional_psychology': '专业心理学',
'public_relations': '公共关系',
'security_study': '安全研究',
'sociology': '社会学',
'sports_science': '体育学',
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教'
}
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一. 请在回答之前一步步思考.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
cmmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=prompt_prefix+QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
cmmlu_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(
type=match_answer_pattern,
# answer_pattern=r'(?i)答案\s*:\s*([A-D])'
answer_pattern=r'(?i)答案\s*:\s*[\W]*([A-D])[\W]*',
)
)
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
))
del _name, _ch_name

View File

@ -0,0 +1,115 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import commonsenseqaDataset
from opencompass.utils.text_postprocessors import (
match_answer_pattern,
)
commonsenseqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
output_column='answerKey',
test_split='validation',
)
_ice_template = dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt='Q: What do people use to absorb extra ink from a fountain pen? Answer Choices: A.shirt pocket B.calligraphers hand C.inkwell D.desk drawer E.blotter',
),
dict(
role='BOT',
prompt='A: The answer must be an item that can absorb ink. Of the above choices, only blotters are used to absorb ink. So the answer is E.',
),
dict(
role='HUMAN',
prompt='Q: What home entertainment equipment requires cable? Answer Choices: A.radio shack B.substation C.television D.cabinet',
),
dict(
role='BOT',
prompt='A: The answer must require cable. Of the above choices, only television requires cable. So the answer is C.',
),
dict(
role='HUMAN',
prompt='Q: The fox walked from the city into the forest, what was it looking for? Answer Choices: A.pretty flowers B.hen house C.natural habitat D.storybook',
),
dict(
role='BOT',
prompt='A: The answer must be something in the forest. Of the above choices, only natural habitat is in the forest. So the answer is C.',
),
dict(
role='HUMAN',
prompt='Q: Sammy wanted to go to where the people were. Where might he go? Answer Choices: A.populated areas B.race track C.desert D.apartment E.roadblock',
),
dict(
role='BOT',
prompt='A: The answer must be a place with a lot of people. Of the above choices, only populated areas have a lot of people. So the answer is A.',
),
dict(
role='HUMAN',
prompt='Q: Where do you put your grapes just before checking out? Answer Choices: A.mouth B.grocery cart C.super market D.fruit basket E.fruit market',
),
dict(
role='BOT',
prompt='A: The answer should be the place where grocery items are placed before checking out. Of the above choices, grocery cart makes the most sense for holding grocery items. So the answer is B.',
),
dict(
role='HUMAN',
prompt='Q: Google Maps and other highway and street GPS services have replaced what? Answer Choices: A.united states B.mexico C.countryside D.atlas',
),
dict(
role='BOT',
prompt='A: The answer must be something that used to do what Google Maps and GPS services do, which is to give directions. Of the above choices, only atlases are used to give directions. So the answer is D.',
),
dict(
role='HUMAN',
prompt='Q: Before getting a divorce, what did the wife feel who was doing all the work? Answer Choices: A.harder B.anguish C.bitterness D.tears E.sadness',
),
dict(
role='BOT',
prompt='A: The answer should be the feeling of someone getting divorced who was doing all the work. Of the above choices, the closest feeling is bitterness. So the answer is C.',
),
dict(
role='HUMAN',
prompt='Q:{question} Answer Choices: A. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\nA:',
),
dict(
role='BOT',
prompt='{answerKey}',
),
],
),
ice_token='</E>',
)
commonsenseqa_infer_cfg = dict(
ice_template=_ice_template,
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
commonsenseqa_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(
type=match_answer_pattern, answer_pattern=r'(?i)so the answer is\s*([A-P])'
),
)
commonsenseqa_datasets = [
dict(
abbr='commonsense_qa',
type=commonsenseqaDataset,
path='opencompass/commonsense_qa',
reader_cfg=commonsenseqa_reader_cfg,
infer_cfg=commonsenseqa_infer_cfg,
eval_cfg=commonsenseqa_eval_cfg,
)
]
del _ice_template

View File

@ -0,0 +1,181 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, HumanEvalPlusEvaluator, humaneval_postprocess_v2
from opencompass.datasets import LCDataset, LCPassKEvaluator
from opencompass.datasets import TACODataset, TACOEvaluator
compassbench_v1_3_code_datasets = []
# --------------------------------------------------------------- HumanEval CN ---------------------------------------------------------------
humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='完成以下Python代码任务:\n{prompt}'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
compassbench_v1_3_code_datasets.append(
dict(
abbr='compass_bench_cdoe_completion_zh',
type=HumanevalDataset,
path='./data/compassbench_v1_3/coding/compass_bench_cdoe_completion/compass_bench_cdoe_completion_zh.jsonl',
# local_mode=True,
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg,
)
)
# --------------------------------------------------------------- HumanEval EN ---------------------------------------------------------------
humaneval_plus_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='Complete the following python code:\n{prompt}'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
humaneval_plus_eval_cfg = dict(
evaluator=dict(type=HumanEvalPlusEvaluator),
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
compassbench_v1_3_code_datasets.append(
dict(
abbr='compass_bench_cdoe_completion_en',
type=HumanevalDataset,
path='./data/compassbench_v1_3/coding/compass_bench_cdoe_completion/compass_bench_cdoe_completion_en.jsonl',
# local_mode=True,
reader_cfg=humaneval_plus_reader_cfg,
infer_cfg=humaneval_plus_infer_cfg,
eval_cfg=humaneval_plus_eval_cfg,
)
)
# ------------------------------------- Code Interview (LCBench) --------------------------------------
LC_difficulties_list = ['EASY', 'MEDIUM', 'HARD']
LC_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column')
LC_en_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'),
dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with ["a","b","c"], we need to push the key one time to type "a", two times to type "b", and three times to type "c" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. Your code should pass these tests:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'),
dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "),
dict(role='HUMAN', prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"),
dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'),
dict(role='BOT', prompt='[BEGIN]\n'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
LC_cn_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k1 <= k <= n你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'),
dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 ["a","b","c"],我们需要按一次键来输入 "a",按两次键来输入 "b",按三次键来输入 "c"\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1*# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'),
dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'),
dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'),
dict(role='BOT', prompt='[BEGIN]\n'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
for difficulty in LC_difficulties_list:
compassbench_v1_3_code_datasets.append(
dict(
type=LCDataset,
abbr='compass_bench_code_interview_en-' + difficulty,
path='./data/compassbench_v1_3/coding/compass_bench_code_interview/compass_bench_code_interview_en.jsonl',
difficulty=difficulty,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_en_infer_cfg,
eval_cfg=LC_eval_cfg,
)
)
compassbench_v1_3_code_datasets.append(
dict(
type=LCDataset,
abbr='compass_bench_code_interview_zh-' + difficulty,
path='./data/compassbench_v1_3/coding/compass_bench_code_interview/compass_bench_code_interview_zh.jsonl',
difficulty=difficulty,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_cn_infer_cfg,
eval_cfg=LC_eval_cfg,
)
)
# -------------------------------------------- Code Competition (TACO) ---------------------------------------------------------------
TACO_difficulties_list = ['EASY', 'MEDIUM', 'MEDIUM_HARD', 'HARD', 'VERY_HARD']
TACO_reader_cfg = dict(input_columns=['question', 'starter'], output_column='problem_id', train_split='test')
TACO_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
)
TACO_eval_cfg = dict(evaluator=dict(type=TACOEvaluator), pred_role='BOT')
for difficulty in TACO_difficulties_list:
compassbench_v1_3_code_datasets.append(
dict(
type=TACODataset,
abbr='TACO-' + difficulty,
path='./data/compassbench_v1_3/coding/compass_bench_code_competition',
difficulty=difficulty,
reader_cfg=TACO_reader_cfg,
infer_cfg=TACO_infer_cfg,
eval_cfg=TACO_eval_cfg,
)
)

View File

@ -0,0 +1,94 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets.compassbench_obj import (
CompassBenchObjectiveV1_3,
compassbench_objective_v1_3_postprocess,
)
from opencompass.utils.text_postprocessors import first_option_postprocess
prompt_cn = {
'single_choice_cn': '以下是一道单项选择题请你根据你了解的知识给出正确的答案选项。请你一步步推理并在最后用“答案选项为X”来回答其中X是ABCD中你认为正确的选项序号\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
'cloze_cn': '以下是一道填空题,请你根据你了解的知识一步步思考后把你的最终答案放到\\boxed{}中。\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
}
prompt_en = {
'single_choice_en': "Here is a single-choice question. Please give the correct answer based on your knowledge. Please reason step by step and answer with 'The answer is X' at the end, where X is the option letter you think is correct.\nHere is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
'cloze_en': "Here is a fill-in-the-blank question. Please think step by step based on your knowledge and put your final answer in \\boxed{}. Here is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
}
douknow_sets = {
'wiki_en_sub_500_人文科学':['single_choice_en'],
'wiki_en_sub_500_社会科学':['single_choice_en'],
'wiki_en_sub_500_生活常识':['single_choice_en'],
'wiki_en_sub_500_自然科学-工科':['single_choice_en'],
'wiki_en_sub_500_自然科学-理科':['single_choice_en'],
'wiki_zh_sub_500_人文科学': ['single_choice_cn'],
'wiki_zh_sub_500_社会科学': ['single_choice_cn'],
'wiki_zh_sub_500_生活常识': ['single_choice_cn'],
'wiki_zh_sub_500_自然科学-工科':['single_choice_cn'],
'wiki_zh_sub_500_自然科学-理科':['single_choice_cn'],
}
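# Keys are JSONL file stems under data_path; each value lists the prompt styles applied to that split.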
data_path = './data/compassbench_v1_3/knowledge'
# Set up the prompts
CircularEval = True
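# CircularEvaluator evaluates each single-choice question under rotated answer options to reduce position bias.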
compassbench_knowledge_datasets = []
for _split in list(douknow_sets.keys()):
for _name in douknow_sets[_split]:
if 'cn' in _name:
single_choice_prompts = prompt_cn
cloze_prompts = prompt_cn
else:
single_choice_prompts = prompt_en
cloze_prompts = prompt_en
if 'single_choice' in _name:
template_round = [dict(role='HUMAN', prompt=single_choice_prompts[_name])]
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
evaluator = dict(type=CircularEvaluator if CircularEval else AccEvaluator)
dataset_name = _name + '_circular' if CircularEval else _name
dataset_abbr = (
'compassbench-' + _split + '_circular'
if CircularEval
else 'compassbench-' + _split
)
else:
template_round = [dict(role='HUMAN', prompt=cloze_prompts[_name])]
pred_postprocessor = dict(
type=compassbench_objective_v1_3_postprocess, name=_name
)
evaluator = dict(type=AccEvaluator)
dataset_name = _name
dataset_abbr = 'compassbench-' + _split
douknow_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate, template=dict(round=template_round)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
douknow_eval_cfg = dict(
evaluator=evaluator,
pred_postprocessor=pred_postprocessor,
)
compassbench_knowledge_datasets.append(
dict(
type=CompassBenchObjectiveV1_3,
path=f'{data_path}/{_split}.jsonl',
name=dataset_name,
abbr=dataset_abbr,
reader_cfg=dict(input_columns=['question'], output_column='answer'),
infer_cfg=douknow_infer_cfg,
eval_cfg=douknow_eval_cfg,
)
)
del _split, _name

View File

@ -0,0 +1,86 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets.compassbench_obj import CompassBenchObjectiveV1_3
from opencompass.datasets import MATHEvaluator, math_postprocess_v2
from opencompass.utils.text_postprocessors import first_option_postprocess
prompt_cn = {
'single_choice_cn': '以下是一道单项选择题请你根据你了解的知识给出正确的答案选项。请你一步步推理并在最后用“答案选项为X”来回答其中X是ABCD中你认为正确的选项序号\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
'cloze_cn': '以下是一道数学计算题,请你一步一步计算,并在最后用\\boxed{}包裹并返回你计算的最终答案。\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
}
prompt_en = {
'single_choice_en': "Here is a single-choice question. Please give the correct answer based on your knowledge. Please reason step by step and answer with 'The answer is X' at the end, where X is the option letter you think is correct.\nHere is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
'cloze_en': 'Here is an arithmetic problem. Please reason step by step, and put your final answer within \\boxed{}. Here is the question you need to answer:\n{question}\nLet\'s solve this problem step by step:',
}
douknow_sets = {
'arithmetic_cloze_en': ['cloze_en'],
'college_single_choice_en': ['single_choice_en'],
'college_single_choice_cn': ['single_choice_cn'],
}
data_path = './data/compassbench_v1_3/math'
# Set up the prompts
CircularEval = True
compassbench_math_datasets = []
for _split in list(douknow_sets.keys()):
for _name in douknow_sets[_split]:
if 'cn' in _name:
single_choice_prompts = prompt_cn
cloze_prompts = prompt_cn
else:
single_choice_prompts = prompt_en
cloze_prompts = prompt_en
if 'single_choice' in _name:
template_round = [dict(role='HUMAN', prompt=single_choice_prompts[_name])]
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
evaluator = dict(type=CircularEvaluator if CircularEval else AccEvaluator)
dataset_name = _name + '_circular' if CircularEval else _name
dataset_abbr = (
'compassbench-' + _split + '_circular'
if CircularEval
else 'compassbench-' + _split
)
else:
template_round = [dict(role='HUMAN', prompt=cloze_prompts[_name])]
pred_postprocessor = dict(
type=math_postprocess_v2,
)
evaluator = dict(type=MATHEvaluator)
dataset_name = _name
dataset_abbr = 'compassbench-' + _split
douknow_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate, template=dict(round=template_round)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
douknow_eval_cfg = dict(
evaluator=evaluator,
pred_postprocessor=pred_postprocessor,
)
compassbench_math_datasets.append(
dict(
type=CompassBenchObjectiveV1_3,
path=f'{data_path}/{_split}.jsonl',
name=dataset_name,
abbr=dataset_abbr,
reader_cfg=dict(input_columns=['question'], output_column='answer'),
infer_cfg=douknow_infer_cfg,
eval_cfg=douknow_eval_cfg,
)
)
del _split, _name

View File

@ -0,0 +1,44 @@
FORCE_STOP_PROMPT_EN = (
"""You should directly give results based on history information."""
)
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please respond with the following format:
```
{thought} Think about what you need to solve and whether you need to use tools.
{action} The tool name; it should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
After your response, the tool will reply using the following format:
```
{response} The results after calling the tool.
```
Therefore DO NOT generate the tool's response yourself.
Also please follow the guidelines:
1. Always use the code interpreter to solve the problem.
2. The generated code should always be in a markdown code block.
3. The generated code will be executed in an IPython session and the results will be cached.
4. Your code should always be simple and should only solve the problem in the current step.
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
df = pd.read_csv(url)
```
{response} The code succeeded without any output.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = """\
It can run Python code as in a Jupyter notebook. The code must be valid Python code."""

View File

@ -1,7 +1,7 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GPQASimpleEvalDataset, GPQA_Simple_Eval_postprocess, GPQAEvaluator
from opencompass.datasets import GPQADataset, GPQA_Simple_Eval_postprocess, GPQAEvaluator
# openai_simple_eval prompt
align_prompt = """
@ -43,7 +43,7 @@ for split in list(gpqa_subsets.keys()):
gpqa_datasets.append(
dict(
abbr='GPQA_' + split,
type=GPQASimpleEvalDataset,
type=GPQADataset,
path='./data/gpqa/',
name=gpqa_subsets[split],
reader_cfg=gpqa_reader_cfg,

View File

@ -0,0 +1,64 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUProDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
with read_base():
from .mmlu_pro_categories import categories
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of the option letters (e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
Question:\n
{question}
Options:\n
{options_str}
""".strip()
mmlu_pro_datasets = []
for category in categories:
mmlu_pro_reader_cfg = dict(
input_columns=['question', 'cot_content', 'options_str'],
output_column='answer',
train_split='validation',
test_split='test',
)
mmlu_pro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN',
prompt=QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
mmlu_pro_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(
type=match_answer_pattern,
answer_pattern=r'(?i)ANSWER\s*:\s*([A-P])')
)
mmlu_pro_datasets.append(
dict(
abbr=f'mmlu_pro_{category.replace(" ", "_")}',
type=MMLUProDataset,
path='opencompass/mmlu_pro',
category=category,
reader_cfg=mmlu_pro_reader_cfg,
infer_cfg=mmlu_pro_infer_cfg,
eval_cfg=mmlu_pro_eval_cfg,
))

View File

@ -51,6 +51,7 @@ for category in categories:
dict(
abbr=f'mmlu_pro_{category.replace(" ", "_")}',
type=MMLUProDataset,
path='opencompass/mmlu_pro',
category=category,
reader_cfg=mmlu_pro_reader_cfg,
infer_cfg=mmlu_pro_infer_cfg,

View File

@ -0,0 +1,68 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import RaceDataset
from opencompass.utils.text_postprocessors import (
first_option_postprocess,
)
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
Article: {article}
Q: {question}
A. {A}
B. {B}
C. {C}
D. {D}
""".strip()
race_reader_cfg = dict(
input_columns=['article', 'question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='validation',
test_split='test',
)
race_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
race_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
pred_role='BOT',
)
race_datasets = [
dict(
abbr='race-middle',
type=RaceDataset,
path='opencompass/race',
name='middle',
reader_cfg=race_reader_cfg,
infer_cfg=race_infer_cfg,
eval_cfg=race_eval_cfg,
),
dict(
abbr='race-high',
type=RaceDataset,
path='opencompass/race',
name='high',
reader_cfg=race_reader_cfg,
infer_cfg=race_infer_cfg,
eval_cfg=race_eval_cfg,
),
]

View File

@ -0,0 +1,53 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import RaceDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
race_reader_cfg = dict(
input_columns=['article', 'question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='validation',
test_split='test'
)
race_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt='Read the article, and answer the question by replying A, B, C or D.\n\nArticle:\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer:'),
dict(role='BOT', prompt='{answer}'),
]
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4]),
inferencer=dict(type=GenInferencer, max_out_len=50),
)
race_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_capital_postprocess),
pred_role='BOT')
race_datasets = [
dict(
abbr='race-middle',
type=RaceDataset,
path='opencompass/race',
name='middle',
reader_cfg=race_reader_cfg,
infer_cfg=race_infer_cfg,
eval_cfg=race_eval_cfg),
dict(
abbr='race-high',
type=RaceDataset,
path='opencompass/race',
name='high',
reader_cfg=race_reader_cfg,
infer_cfg=race_infer_cfg,
eval_cfg=race_eval_cfg)
]

View File

@ -33,6 +33,7 @@ instruction_generalization_eng_eval_cfg = dict(
instruction_generalization_eng_datasets = [
dict(
abbr='RoleBench_instruct_eng',
type=InstructionGeneralizationEnglishDataset,
path='ZenMoore/RoleBench',
reader_cfg=instruction_generalization_eng_reader_cfg,

View File

@ -33,6 +33,7 @@ instruction_generalization_zh_eval_cfg = dict(
instruction_generalization_zh_datasets = [
dict(
abbr='RoleBench_instruct_zh',
type=InstructionGeneralizationChineseDataset,
path='ZenMoore/RoleBench',
reader_cfg=instruction_generalization_zh_reader_cfg,

View File

@ -33,6 +33,7 @@ role_generalization_eng_eval_cfg = dict(
role_generalization_eng_datasets = [
dict(
abbr='RoleBench_role_eng',
type=RoleGeneralizationEnglishDataset,
path='ZenMoore/RoleBench',
reader_cfg=role_generalization_eng_reader_cfg,

View File

@ -0,0 +1,14 @@
# Ruler
OpenCompass now supports [RULER](https://arxiv.org/pdf/2404.06654), a brand-new benchmark for evaluating long-context language models. RULER covers retrieval, multi-hop tracing, aggregation, and question answering through flexible configurations.
OpenCompass provides two evaluation demos that differ in which tokenizer is used.
To evaluate with a single fixed tokenizer (typically GPT-4's), follow the demo in `configs/eval_ruler_fix_tokenizer.py`, where most of the settings are already defined.
To evaluate with each model's own tokenizer, you have to build the settings yourself (we do not know in advance which model you are evaluating): create a new evaluation script following the example in `configs/eval_ruler.py`, then change the context window sizes or add models according to your setup.
```bash
python run.py configs/eval_ruler_fix_tokenizer.py # For evaluation with GPT-4 tokenizer
python run.py configs/eval_ruler.py # For evaluation with model's tokenizer
```
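For reference, a custom script in the second mode can be as small as the sketch below; the import paths are illustrative and should be adapted to wherever the Ruler dataset and model configs live in your checkout:
```python
from mmengine.config import read_base

with read_base():
    # Illustrative paths: point these at your local Ruler and model configs.
    from .datasets.ruler.ruler_combined_gen import ruler_combined_datasets
    from .models.hf_internlm.lmdeploy_internlm2_7b import models

# OpenCompass picks up the `datasets` and `models` variables from the config.
datasets = ruler_combined_datasets
```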

View File

@ -0,0 +1,28 @@
from mmengine.config import read_base
with read_base():
from .ruler_niah_gen import niah_datasets # Niah
from .ruler_vt_gen import vt_datasets # VT
from .ruler_fwe_gen import fwe_datasets # FWE
from .ruler_cwe_gen import cwe_datasets # CWE
from .ruler_qa_gen import qa_datasets # QA
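# Gather every imported *_datasets list from the local namespace into one flat list.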
import_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
# Change the context lengths to be tested
max_seq_lens = [1024 * 128]
abbr_suffixs = ['128k']
ruler_datasets = []
# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
for dataset in import_datasets:
tmp_dataset = dataset.deepcopy()
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
ruler_datasets.append(tmp_dataset)

View File

@ -0,0 +1,29 @@
from mmengine.config import read_base
with read_base():
from .ruler_niah_gen import niah_datasets # Niah
from .ruler_vt_gen import vt_datasets # VT
from .ruler_fwe_gen import fwe_datasets # FWE
from .ruler_cwe_gen import cwe_datasets # CWE
from .ruler_qa_gen import qa_datasets # QA
import_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
# Change the context lengths to be tested
max_seq_lens = [1024 * 16]
abbr_suffixs = ['16k']
ruler_datasets = []
# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
for dataset in import_datasets:
tmp_dataset = dataset.deepcopy()
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
ruler_datasets.append(tmp_dataset)

View File

@ -0,0 +1,29 @@
from mmengine.config import read_base
with read_base():
from .ruler_niah_gen import niah_datasets # Niah
from .ruler_vt_gen import vt_datasets # VT
from .ruler_fwe_gen import fwe_datasets # FWE
from .ruler_cwe_gen import cwe_datasets # CWE
from .ruler_qa_gen import qa_datasets # QA
import_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
# Change the context lengths to be tested
max_seq_lens = [1024 * 1024]
abbr_suffixs = ['1m']
ruler_datasets = []
# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
for dataset in import_datasets:
tmp_dataset = dataset.deepcopy()
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
ruler_datasets.append(tmp_dataset)

View File

@ -0,0 +1,29 @@
from mmengine.config import read_base
with read_base():
from .ruler_niah_gen import niah_datasets # Niah
from .ruler_vt_gen import vt_datasets # VT
from .ruler_fwe_gen import fwe_datasets # FWE
from .ruler_cwe_gen import cwe_datasets # CWE
from .ruler_qa_gen import qa_datasets # QA
import_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
# Change the context lengths to be tested
max_seq_lens = [1024 * 32]
abbr_suffixs = ['32k']
ruler_datasets = []
# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
for dataset in import_datasets:
tmp_dataset = dataset.deepcopy()
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
ruler_datasets.append(tmp_dataset)

View File

@ -0,0 +1,28 @@
from mmengine.config import read_base
with read_base():
from .ruler_niah_gen import niah_datasets # Niah
from .ruler_vt_gen import vt_datasets # VT
from .ruler_fwe_gen import fwe_datasets # FWE
from .ruler_cwe_gen import cwe_datasets # CWE
from .ruler_qa_gen import qa_datasets # QA
import_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
# Change the context lengths to be tested
max_seq_lens = [1024 * 4]
abbr_suffixs = ['4k']
ruler_datasets = []
# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
for dataset in import_datasets:
tmp_dataset = dataset.deepcopy()
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
ruler_datasets.append(tmp_dataset)

View File

@ -0,0 +1,29 @@
from mmengine.config import read_base
with read_base():
from .ruler_niah_gen import niah_datasets # Niah
from .ruler_vt_gen import vt_datasets # VT
from .ruler_fwe_gen import fwe_datasets # FWE
from .ruler_cwe_gen import cwe_datasets # CWE
from .ruler_qa_gen import qa_datasets # QA
import_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
# Change the context lengths to be tested
max_seq_lens = [1024 * 8]
abbr_suffixs = ['8k']
ruler_datasets = []
# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
for dataset in import_datasets:
tmp_dataset = dataset.deepcopy()
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
ruler_datasets.append(tmp_dataset)

View File

@ -0,0 +1,13 @@
from mmengine.config import read_base
with read_base():
from .ruler_4k_gen import ruler_datasets as ruler_4k_datasets
from .ruler_8k_gen import ruler_datasets as ruler_8k_datasets
from .ruler_16k_gen import ruler_datasets as ruler_16k_datasets
from .ruler_32k_gen import ruler_datasets as ruler_32k_datasets
from .ruler_128k_gen import ruler_datasets as ruler_128k_datasets
from .ruler_1m_gen import ruler_datasets as ruler_1m_datasets
ruler_combined_datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')), []
)

View File

@ -0,0 +1,34 @@
from opencompass.datasets.ruler.ruler_cwe import RulerCweDataset
from opencompass.datasets.ruler.ruler_cwe import RulerCweEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
# CWE Dataset
cwe_datasets = [
{
'abbr': 'ruler_cwe',
'type': RulerCweDataset,
'freq_cw': 30,
'freq_ucw': 3,
'num_cw': 10,
'tokens_to_generate': 120,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=RulerCweEvaluator),
),
}
]

View File

@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_fwe import RulerFweDataset
from opencompass.datasets.ruler.ruler_fwe import RulerFweEvaluator
# FWE Dataset
fwe_datasets = [
{
'abbr': 'ruler_fwe',
'type': RulerFweDataset,
'tokens_to_generate': 50,
'alpha': 2.0,
'coded_wordlen': 6,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=RulerFweEvaluator),
),
}
]

View File

@ -0,0 +1,123 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset
from opencompass.datasets.ruler.ruler_niah import RulerNiahEvaluator
# Ruler Dataset settings
niah_configurations = [
{
'abbr': 'single_1',
'type_haystack': 'repeat',
'type_needle_k': 'words',
'type_needle_v': 'numbers',
'num_needle_k': 1,
'num_needle_v': 1,
'num_needle_q': 1,
},
{
'abbr': 'single_2',
'type_haystack': 'essay',
'type_needle_k': 'words',
'type_needle_v': 'numbers',
'num_needle_k': 1,
'num_needle_v': 1,
'num_needle_q': 1,
},
{
'abbr': 'single_3',
'type_haystack': 'essay',
'type_needle_k': 'words',
'type_needle_v': 'uuids',
'num_needle_k': 1,
'num_needle_v': 1,
'num_needle_q': 1,
},
{
'abbr': 'multikey_1',
'type_haystack': 'essay',
'type_needle_k': 'words',
'type_needle_v': 'numbers',
'num_needle_k': 4,
'num_needle_v': 1,
'num_needle_q': 1,
},
{
'abbr': 'multikey_2',
'type_haystack': 'needle',
'type_needle_k': 'words',
'type_needle_v': 'numbers',
'num_needle_k': 1,
'num_needle_v': 1,
'num_needle_q': 1,
},
{
'abbr': 'multikey_3',
'type_haystack': 'needle',
'type_needle_k': 'uuids',
'type_needle_v': 'uuids',
'num_needle_k': 1,
'num_needle_v': 1,
'num_needle_q': 1,
},
{
'abbr': 'multivalue',
'type_haystack': 'essay',
'type_needle_k': 'words',
'type_needle_v': 'numbers',
'num_needle_k': 1,
'num_needle_v': 4,
'num_needle_q': 1,
},
{
'abbr': 'multiquery',
'type_haystack': 'essay',
'type_needle_k': 'words',
'type_needle_v': 'numbers',
'num_needle_k': 1,
'num_needle_v': 1,
'num_needle_q': 4,
},
]
niah_datasets = []
# NIAH Dataset
base_path = './data/ruler'
file_path = 'PaulGrahamEssays.jsonl'
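# PaulGrahamEssays.jsonl supplies the haystack text used by the essay-type configurations above.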
for index, config in enumerate(niah_configurations):
dataset_dict = {
'abbr': f'ruler_niah_{config["abbr"]}',
'type': RulerNiahDataset,
'base_path': base_path,
'file_path': file_path,
# 'tokenizer_model': model_path,
'tokens_to_generate': 128,
# 'max_seq_length': max_seq_len,
# 'num_samples': NUM_SAMPLES,
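# max_seq_length and num_samples are injected by the per-length wrapper configs (e.g. ruler_4k_gen.py); tokenizer_model is typically set by the top-level eval script.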
'type_haystack': config['type_haystack'],
'type_needle_k': config['type_needle_k'],
'type_needle_v': config['type_needle_v'],
'num_needle_k': config['num_needle_k'],
'num_needle_v': config['num_needle_v'],
'num_needle_q': config['num_needle_q'],
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=RulerNiahEvaluator),
),
}
niah_datasets.append(dataset_dict)

View File

@ -0,0 +1,38 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_qa import RulerQaDataset
from opencompass.datasets.ruler.ruler_qa import RulerQaEvaluator
qa_configurations = [
{'dataset': 'squad', 'path': './data/ruler/dev-v2.0.json'},
{'dataset': 'hotpotqa', 'path': './data/ruler/hotpotqa.json'},
]
qa_datasets = []
for index, config in enumerate(qa_configurations):
dataset_dict = {
'abbr': f'ruler_qa_{config["dataset"]}',
'dataset': config['dataset'],
'path': config['path'],
'type': RulerQaDataset,
'tokens_to_generate': 50,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=RulerQaEvaluator),
),
}
qa_datasets.append(dataset_dict)

View File

@ -0,0 +1,32 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_vt import RulerVtDataset
from opencompass.datasets.ruler.ruler_vt import RulerVtEvaluator
# VT Dataset
vt_datasets = [
{
'abbr': 'ruler_vt',
'type': RulerVtDataset,
'num_chains': 1,
'num_hops': 4,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=RulerVtEvaluator),
),
}
]

View File

@ -0,0 +1,31 @@
# SciCode: A Research Coding Benchmark Curated by Scientists
## Introduction
SciCode is a challenging benchmark designed to evaluate the capabilities of language models (LMs) in generating code to solve realistic scientific research problems. It covers 16 subdomains from 6 domains, including Physics, Math, Material Science, Biology, and Chemistry. Unlike previous benchmarks that consist of exam-like question-answer pairs, SciCode is converted from real research problems. SciCode problems naturally factorize into multiple subproblems, each involving knowledge recall, reasoning, and code synthesis. In total, SciCode contains 338 subproblems decomposed from 80 challenging main problems, and it offers optional descriptions of useful scientific background information as well as scientist-annotated gold-standard solutions and test cases for evaluation. Claude 3.5 Sonnet, the best-performing model among those tested, can solve only 4.6% of the problems in the most realistic setting. Broadly, SciCode reflects scientists' everyday workflow of identifying critical science concepts and facts and then transforming them into computation and simulation code. We believe SciCode not only demonstrates contemporary LLMs' progress towards becoming helpful assistants for scientists but also sheds light on the future building and evaluation of scientific AI. For more detailed information, please refer to https://scicode-bench.github.io/.
## How to Use
By modifying the `with_bg` parameter in the configuration file, you can enable the w/ background evaluation setting.
```bash
python run.py --datasets scicode_gen --hf-num-gpus 1 --hf-type chat --hf-path meta-llama/Meta-Llama-3-8B-Instruct --debug --model-kwargs device_map='auto' trust_remote_code=True --batch-size 1
```
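As a concrete example, switching to the w/ background setting amounts to flipping `with_bg` in both the dataset and evaluator entries (field names follow the shipped `scicode_gen` config; the abbr below is illustrative):
```python
from opencompass.datasets import SciCodeDataset, SciCodeEvaluator

# Sketch: the w/ background variant; reader/infer cfgs stay as in the shipped config.
SciCode_eval_cfg = dict(
    evaluator=dict(type=SciCodeEvaluator, dataset_path='./data/scicode', with_bg=True))

SciCode_datasets = [
    dict(
        abbr='SciCode_with_background',  # illustrative name
        type=SciCodeDataset,
        path='./data/scicode',
        with_bg=True,
        eval_cfg=SciCode_eval_cfg,
    )
]
```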
## Reference Performance
| Model | Condition | Subproblem Accuracy | Main Problem Accuracy |
|---------------------------|--------------|---------------------|-----------------------|
| Llama-3-70B-Instruct | w/o Background | 21.53% | 4.62% |
| Llama-3-70B-Instruct | w/ Background | 24.31% | 7.69% |
| Qwen2-72B-Instruct | w/o Background | 16.67% | 1.54% |
| Qwen2-72B-Instruct | w/ Background | 19.79% | 1.54% |
## Citation
```
@misc{tian2024scicode,
title={SciCode: A Research Coding Benchmark Curated by Scientists},
author={Minyang Tian and Luyu Gao and Shizhuo Dylan Zhang and Xinan Chen and Cunwei Fan and Xuefei Guo and Roland Haas and Pan Ji and Kittithat Krongchon and Yao Li and Shengyan Liu and Di Luo and Yutao Ma and Hao Tong and Kha Trinh and Chenyu Tian and Zihan Wang and Bohao Wu and Yanyu Xiong and Shengzhu Yin and Minhui Zhu and Kilian Lieret and Yanxin Lu and Genglin Liu and Yufeng Du and Tianhua Tao and Ofir Press and Jamie Callan and Eliu Huerta and Hao Peng},
year={2024},
eprint={2407.13168},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
```

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .scicode_gen_085b98 import SciCode_datasets # noqa: F401, F403

View File

@ -0,0 +1,29 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer
from opencompass.datasets import SciCodeDataset, SciCodeEvaluator
SciCode_reader_cfg = dict(input_columns=['prompt'], output_column=None)
SciCode_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template='',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, infer_mode='every', max_out_len=4096))
SciCode_eval_cfg = dict(evaluator=dict(type=SciCodeEvaluator, dataset_path='./data/scicode', with_bg=False))
SciCode_datasets = [
dict(
abbr='SciCode',
type=SciCodeDataset,
path='./data/scicode',
with_bg=False,
reader_cfg=SciCode_reader_cfg,
infer_cfg=SciCode_infer_cfg,
eval_cfg=SciCode_eval_cfg)
]

View File

@ -150,5 +150,5 @@ for _name, _prompt in sub_map.items():
infer_order='double',
base_models=gpt4,
summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))

View File

@ -23,7 +23,6 @@ subjective_all_sets = {
'coding/compass_bench_coding_cn_val',
],
}
data_path = './data/compassbench_v1_3/'
pair_prompt_en = """# Instruction
@ -184,7 +183,7 @@ pair_prompt_cn = """# 指令
checklist_datasets = []
gpt4 = [
dict(
abbr='gpt4o',
abbr='gpt4-1106',
)
]
for lan, data_name_list in subjective_all_sets.items():

View File

@ -0,0 +1,99 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts', 'fofo_test_prompts_cn',
]
base_prompt = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
fofo_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))

View File

@ -12,7 +12,7 @@ subjective_reader_cfg = dict(
)
subjective_all_sets = [
'fofo_test_prompts', 'fofo_test_prompts_cn',
'fofo_test_prompts'
]
base_prompt = """

View File

@ -11,7 +11,7 @@ subjective_reader_cfg = dict(
)
data_path ='./data/WildBench/wildbench.jsonl'
data_path ='./data/subjective/WildBench/wildbench.jsonl'
wildbench_datasets = []
subjective_infer_cfg = dict(
@ -54,11 +54,11 @@ wildbench_datasets.append(
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/WildBench/gpt4'},
{'abbr': 'llama-2-70b-chat-hf', 'path':'./data/WildBench/llama2-70b'},
{'abbr': 'HaiKu', 'path':'./data/WildBench/claude'},
{'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/WildBench/llama2-70b'},
{'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/WildBench/llama2-70b'}],
given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/subjective/WildBench/gpt4'},
{'abbr': 'llama-2-70b-chat-hf', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'HaiKu', 'path':'./data/subjective/WildBench/claude'},
{'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
mode='m2n', # m models compete against n models
infer_order='random',
base_models = [llama_2_70b, gpt4, claude]

View File

@ -18,19 +18,16 @@ truthfulqa_infer_cfg = dict(
inferencer=dict(type=GenInferencer))
# Metrics such as 'truth' and 'info' need
# OPENAI_API_KEY with finetuned models in it.
# Please use your own finetuned openai model with keys and refers to
# extra judge models.
# Please use your own finetuned model and refer to
# the source code of `TruthfulQAEvaluator` for more details.
#
# If you cannot provide available models for 'truth' and 'info',
# and want to perform basic metric eval, please set
# `metrics=('bleurt', 'rouge', 'bleu')`
# When key is set to "ENV", the key will be fetched from the environment
# variable $OPENAI_API_KEY. Otherwise, set key in here directly.
truthfulqa_eval_cfg = dict(
evaluator=dict(
type=TruthfulQAEvaluator, metrics=('truth', 'info'), key='ENV'), )
type=TruthfulQAEvaluator, metrics=('bleu',), key='ENV'), )
truthfulqa_datasets = [
dict(

View File

@ -29,6 +29,7 @@ winograd_datasets = [
abbr='winograd',
type=WinogradDataset,
path='winograd_wsc',
trust_remote_code=True,
name='wsc285',
reader_cfg=winograd_reader_cfg,
infer_cfg=winograd_infer_cfg,

View File

@ -33,6 +33,7 @@ winograd_datasets = [
abbr='winograd',
type=WinogradDataset,
path='winograd_wsc',
trust_remote_code=True,
name='wsc285',
reader_cfg=winograd_reader_cfg,
infer_cfg=winograd_infer_cfg,

View File

@ -1,12 +1,12 @@
from mmengine.config import read_base
with read_base():
from .models.mistral.hf_mistral_7b_v0_1 import models as hf_mistral_7b_v0_1_model
from .models.mistral.hf_mistral_7b_v0_2 import models as hf_mistral_7b_v0_2_model
from .models.hf_internlm.hf_internlm2_20b import models as hf_internlm2_20b_model
from .models.hf_internlm.hf_internlm2_math_20b import models as hf_internlm2_math_20b_model
from opencompass.configs.models.mistral.hf_mistral_7b_v0_1 import models as hf_mistral_7b_v0_1_model
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import models as hf_mistral_7b_v0_2_model
from opencompass.configs.models.hf_internlm.hf_internlm2_20b import models as hf_internlm2_20b_model
from opencompass.configs.models.hf_internlm.hf_internlm2_math_20b import models as hf_internlm2_math_20b_model
from .datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets as datasets
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets as datasets
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

View File

@ -0,0 +1,196 @@
from mmengine.config import read_base
import os.path as osp
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
## Core Set
# ## Examination
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import cmmlu_datasets
# ## Reasoning
from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
# ## Math
from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets
# ## Coding
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
# ## Instruction Following
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
# Summarizer
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
# Model List
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
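# Every `*_datasets` list imported in the read_base() block above is
# concatenated into one flat list; add or comment out an import to change
# the evaluated set.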
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
# with read_base():
core_summary_groups = [
{
'name': 'core_average',
'subsets': [
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
# ['cmmlu', 'naive_average'],
['cmmlu', 'accuracy'],
['bbh', 'score'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
],
},
]
summarizer = dict(
dataset_abbrs=[
['core_average', 'naive_average'],
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'],
['bbh', 'score'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
'',
['mmlu', 'accuracy'],
['mmlu-stem', 'accuracy'],
['mmlu-social-science', 'accuracy'],
['mmlu-humanities', 'accuracy'],
['mmlu-other', 'accuracy'],
'',
['mmlu_pro', 'accuracy'],
['mmlu_pro_math','accuracy'],
['mmlu_pro_physics', 'accuracy'],
['mmlu_pro_chemistry', 'accuracy'],
['mmlu_pro_law', 'accuracy'],
['mmlu_pro_engineering', 'accuracy'],
['mmlu_pro_other', 'accuracy'],
['mmlu_pro_economics', 'accuracy'],
['mmlu_pro_health', 'accuracy'],
['mmlu_pro_psychology', 'accuracy'],
['mmlu_pro_business', 'accuracy'],
['mmlu_pro_biology', 'accuracy'],
['mmlu_pro_philosophy', 'accuracy'],
['mmlu_pro_computer_science','accuracy'],
['mmlu_pro_history', 'accuracy'],
'',
['cmmlu', 'accuracy'],
['cmmlu-stem', 'accuracy'],
['cmmlu-social-science', 'accuracy'],
['cmmlu-humanities', 'accuracy'],
['cmmlu-other', 'accuracy'],
['cmmlu-china-specific', 'accuracy'],
'',
['bbh', 'extract_rate'],
['math', 'extract_rate'],
# ['openai_humaneval', 'extract_rate'],
['GPQA_diamond', 'extract_rate'],
# ['IFEval', 'extract_rate'],
'',
['mmlu', 'extract_rate'],
['mmlu-stem', 'extract_rate'],
['mmlu-social-science', 'extract_rate'],
['mmlu-humanities', 'extract_rate'],
['mmlu-other', 'extract_rate'],
'',
['mmlu_pro', 'extract_rate'],
['mmlu_pro_math', 'extract_rate'],
['mmlu_pro_physics', 'extract_rate'],
['mmlu_pro_chemistry', 'extract_rate'],
['mmlu_pro_law', 'extract_rate'],
['mmlu_pro_engineering', 'extract_rate'],
['mmlu_pro_other', 'extract_rate'],
['mmlu_pro_economics', 'extract_rate'],
['mmlu_pro_health', 'extract_rate'],
['mmlu_pro_psychology', 'extract_rate'],
['mmlu_pro_business', 'extract_rate'],
['mmlu_pro_biology', 'extract_rate'],
['mmlu_pro_philosophy', 'extract_rate'],
['mmlu_pro_computer_science', 'extract_rate'],
['mmlu_pro_history', 'extract_rate'],
'',
['cmmlu', 'extract_rate'],
['cmmlu-stem', 'extract_rate'],
['cmmlu-social-science', 'extract_rate'],
['cmmlu-humanities', 'extract_rate'],
['cmmlu-other', 'extract_rate'],
['cmmlu-china-specific', 'extract_rate'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
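# NumWorkerPartitioner shards each inference job into `num_worker` pieces so
# the local runner can execute them in parallel (up to max_num_workers at a time).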
infer = dict(
partitioner=dict(
type=NumWorkerPartitioner,
num_worker=8
),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)
),
)
# eval with local runner
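# NaivePartitioner creates one task per model-dataset pair; n=10 bundles every
# ten of them into a single task to cut scheduling overhead.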
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(
type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench_v1_9/'
work_dir = osp.join(base_exp_dir, 'chat_objective')

View File

@ -1,11 +1,11 @@
from mmengine.config import read_base
with read_base():
from .datasets.ceval.ceval_gen import ceval_datasets
from .datasets.cmmlu.cmmlu_gen import cmmlu_datasets
from .datasets.agieval.agieval_gen import agieval_datasets
from .datasets.bbh.bbh_gen import bbh_datasets
from .datasets.mmlu.mmlu_gen import mmlu_datasets
from .models.alaya.alaya import models
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_gen import cmmlu_datasets
from opencompass.configs.datasets.agieval.agieval_gen import agieval_datasets
from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen import mmlu_datasets
from opencompass.configs.models.alaya.alaya import models
datasets = [*bbh_datasets, *ceval_datasets, *cmmlu_datasets, *agieval_datasets, *mmlu_datasets]

9
configs/eval_api_demo.py Normal file
View File

@ -0,0 +1,9 @@
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import gsm8k_datasets
from opencompass.configs.datasets.demo.demo_math_chat_gen import math_datasets
from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt4
datasets = gsm8k_datasets + math_datasets
models = gpt4
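# Note: when an OpenAI model config sets key='ENV', the API key is fetched
# from the OPENAI_API_KEY environment variable, so export it before running.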

View File

@ -5,8 +5,8 @@ from opencompass.tasks import OpenICLAttackTask
with read_base():
# choose a list of datasets
from .datasets.promptbench.promptbench_wnli_gen_50662f import wnli_datasets
from .models.hf_vicuna_7b import models
from opencompass.configs.datasets.promptbench.promptbench_wnli_gen_50662f import wnli_datasets
from opencompass.configs.models.qwen.hf_qwen2_1_5b import models
datasets = wnli_datasets

View File

@ -1,10 +1,10 @@
from mmengine.config import read_base
with read_base():
from .datasets.demo.demo_gsm8k_base_gen import gsm8k_datasets
from .datasets.demo.demo_math_base_gen import math_datasets
from .models.qwen.hf_qwen2_1_5b import models as hf_qwen2_1_5b_models
from .models.hf_internlm.hf_internlm2_1_8b import models as hf_internlm2_1_8b_models
from opencompass.configs.datasets.demo.demo_gsm8k_base_gen import gsm8k_datasets
from opencompass.configs.datasets.demo.demo_math_base_gen import math_datasets
from opencompass.configs.models.qwen.hf_qwen2_1_5b import models as hf_qwen2_1_5b_models
from opencompass.configs.models.hf_internlm.hf_internlm2_1_8b import models as hf_internlm2_1_8b_models
datasets = gsm8k_datasets + math_datasets
models = hf_qwen2_1_5b_models + hf_internlm2_1_8b_models

View File

@ -1,9 +1,9 @@
from mmengine.config import read_base
with read_base():
from .datasets.lveval.lveval import LVEval_datasets as datasets
from .models.bluelm.hf_bluelm_7b_chat_32k import models
from .summarizers.lveval import summarizer
from opencompass.configs.datasets.lveval.lveval import LVEval_datasets as datasets
from opencompass.configs.models.bluelm.hf_bluelm_7b_chat_32k import models
from opencompass.configs.summarizers.lveval import summarizer
models[0][
'path'

View File

@ -1,57 +1,57 @@
from mmengine.config import read_base
from opencompass.models import OpenAI
from opencompass.models import OpenAI
from opencompass.runners import LocalRunner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CharmMemSummarizer
with read_base():
from .datasets.CHARM.charm_memory_gen_bbbd53 import charm_memory_datasets as datasets
from opencompass.configs.datasets.CHARM.charm_memory_gen_bbbd53 import charm_memory_datasets as datasets
# ------>>>>>> https://arxiv.org/abs/2403.14112
# from .models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
# from .models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
# from .models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
# from .models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
# from .models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
# from .models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
# from .models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
# from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
# from .models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
# from .models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
# from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
# from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
# from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
# from .models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
# from .models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
# from .models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
# from .models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
# from .models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
# from .models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
# from opencompass.configs.models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
# from opencompass.configs.models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
# from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
# from opencompass.configs.models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
# from opencompass.configs.models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
# from opencompass.configs.models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
# from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
# from opencompass.configs.models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
# from opencompass.configs.models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
# <<<<<<------ https://arxiv.org/abs/2403.14112
# from .models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
# from .models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
# from .models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
# from .models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
# from opencompass.configs.models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
# from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
# from opencompass.configs.models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
# from opencompass.configs.models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
# from .models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
# from .models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
# from .models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
# from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
# from .models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
# from .models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
# from .models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
# from .models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
# from .models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
# from .models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
# from .models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
# from .models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

View File

@ -1,51 +1,51 @@
from mmengine.config import read_base
with read_base():
from .datasets.CHARM.charm_reason_gen_f8fca2 import charm_reason_datasets as datasets
from opencompass.configs.datasets.CHARM.charm_reason_gen_f8fca2 import charm_reason_datasets as datasets
# ------>>>>>> https://arxiv.org/abs/2403.14112
# from .models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
# from .models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
# from .models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
# from .models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
# from .models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
# from .models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
# from .models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
# from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
# from .models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
# from .models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
# from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
# from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
# from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
# from .models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
# from .models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
# from .models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
# from .models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
# from .models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
# from .models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
# from opencompass.configs.models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
# from opencompass.configs.models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
# from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
# from opencompass.configs.models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
# from opencompass.configs.models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
# from opencompass.configs.models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
# from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
# from opencompass.configs.models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
# from opencompass.configs.models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
# <<<<<<------ https://arxiv.org/abs/2403.14112
# from .models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
# from .models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
# from .models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
# from .models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
# from opencompass.configs.models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
# from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
# from opencompass.configs.models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
# from opencompass.configs.models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
# from .models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
# from .models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
# from .models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
# from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
# from .models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
# from .models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
# from .models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
# from .models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
# from .models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
# from .models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
# from .models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
# from .models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
from .summarizers.charm_reason import summarizer

View File

@ -9,10 +9,10 @@ from lagent import ReAct
from lagent.agents.react import ReActProtocol
with read_base():
from .datasets.gsm8k.gsm8k_agent_gen_be1606 import gsm8k_datasets
from .datasets.math.math_agent_gen_af2293 import math_datasets
from .datasets.MathBench.mathbench_agent_gen_568903 import mathbench_agent_datasets
from .summarizers.math_agent import summarizer
from opencompass.configs.datasets.gsm8k.gsm8k_agent_gen_be1606 import gsm8k_datasets
from opencompass.configs.datasets.math.math_agent_gen_af2293 import math_datasets
from opencompass.configs.datasets.MathBench.mathbench_agent_gen_568903 import mathbench_agent_datasets
from opencompass.configs.summarizers.math_agent import summarizer
datasets = []
datasets += gsm8k_datasets

View File

@ -5,10 +5,10 @@ from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from .datasets.gsm8k.gsm8k_gen_d6de81 import gsm8k_datasets
from .datasets.math.math_gen_1ed9c2 import math_datasets
from .datasets.MathBench.mathbench_gen import mathbench_datasets
from .summarizers.math_baseline import summarizer
from opencompass.configs.datasets.gsm8k.gsm8k_gen_d6de81 import gsm8k_datasets
from opencompass.configs.datasets.math.math_gen_1ed9c2 import math_datasets
from opencompass.configs.datasets.MathBench.mathbench_gen import mathbench_datasets
from opencompass.configs.summarizers.math_baseline import summarizer
datasets = []
datasets += gsm8k_datasets

View File

@ -1,96 +0,0 @@
from mmengine.config import read_base
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.agents.react import CIReAct, ReActProtocol
from opencompass.models.lagent import CodeAgent
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from .datasets.CIBench.CIBench_template_gen_e6b12a import \
cibench_datasets as datasets
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please response with the following format:
```
{thought} Think what you need to solve, do you need to use tools?
{action} The tool name, should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you response after your response using the following format:
```
{response} the results after call the tool.
```
Therefore DO NOT generate tool response by yourself.
Also please follow the guidelines:
1. Always use code interpreter to solve the problem.
2. The generated codes should always in a markdown code block format.
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
data = pd.read_csv(url)
```
{response} The code is succeed without any outputs.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.'''
models = [
dict(
abbr='gpt-3.5-code',
type=CodeAgent,
agent_type=CIReAct,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=[
dict(type=IPythonInterpreter,
description=IPYTHON_INTERPRETER_DESCRIPTION,
user_data_dir='./data/cibench_dataset/datasources')
],
protocol=dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
),
batch_size=1,
use_system_role=False, # use `user` role instead of system role
first_system_role=False, # use `user` role of the first instruction prompt
merge_adjacent_role=True, # merge adjacent same user content
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(
type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)

View File

@ -1,10 +1,10 @@
from mmengine.config import read_base
with read_base():
from .datasets.demo.demo_gsm8k_chat_gen import gsm8k_datasets
from .datasets.demo.demo_math_chat_gen import math_datasets
from .models.qwen.hf_qwen2_1_5b_instruct import models as hf_qwen2_1_5b_instruct_models
from .models.hf_internlm.hf_internlm2_chat_1_8b import models as hf_internlm2_chat_1_8b_models
from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import gsm8k_datasets
from opencompass.configs.datasets.demo.demo_math_chat_gen import math_datasets
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import models as hf_qwen2_1_5b_instruct_models
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_1_8b import models as hf_internlm2_chat_1_8b_models
datasets = gsm8k_datasets + math_datasets
models = hf_qwen2_1_5b_instruct_models + hf_internlm2_chat_1_8b_models

View File

@ -7,7 +7,7 @@ from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets as datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets as datasets
models = [
dict(

View File

@ -1,8 +1,8 @@
from mmengine.config import read_base
with read_base():
from .datasets.ChemBench.ChemBench_gen import chembench_datasets
from .models.mistral.hf_mistral_7b_instruct_v0_2 import models
from opencompass.configs.datasets.ChemBench.ChemBench_gen import chembench_datasets
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import models
datasets = [*chembench_datasets]
models = [*models]

View File

@ -1,15 +1,37 @@
from copy import deepcopy
from mmengine.config import read_base
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner, SlurmRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.models import OpenAI
from opencompass.models.lagent import LagentAgent
from lagent import ReAct
from lagent.agents.react import ReActProtocol
from opencompass.models.lagent import CodeAgent
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.agents.react import CIReAct
from opencompass.models.lagent import CodeAgent
from lagent.agents.react import ReActProtocol
from opencompass.models import HuggingFaceCausalLM
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.partitioners import NaivePartitioner
with read_base():
from .datasets.CIBench.CIBench_gen_eb42f9 import cibench_datasets as datasets
# Note that CUDA OOM errors may occur for HF models
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
from opencompass.configs.summarizers.cibench import summarizer
from opencompass.configs.datasets.CIBench.CIBench_template_gen_e6b12a import cibench_datasets as cibench_datasets_template
from opencompass.configs.datasets.CIBench.CIBench_generation_gen_8ab0dc import cibench_datasets as cibench_datasets_generation
# Oracle mode for analysis
# from opencompass.configs.datasets.CIBench.CIBench_template_oracle_gen_fecda1 import cibench_datasets as cibench_datasets_template_oracle
# from opencompass.configs.datasets.CIBench.CIBench_generation_oracle_gen_c4a7c1 import cibench_datasets as cibench_datasets_generation_oracle
datasets = []
datasets += cibench_datasets_template
datasets += cibench_datasets_generation
# datasets += cibench_datasets_template_oracle
# datasets += cibench_datasets_generation_oracle
_origin_models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
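# Gather every `*_model` list imported via read_base(); these base chat
# models are wrapped into CIBench code agents below.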
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
@ -34,47 +56,87 @@ Also please follow the guidelines:
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.
Begin!
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
data = pd.read_csv(url)
```
{response} The code succeeded without any outputs.
Let us begin from here!
"""
models = [
dict(
abbr='gpt-3.5-turbo',
type=CodeAgent,
agent_type=CIReAct,
mutli_rounds=True,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=[
dict(
type=IPythonInterpreter,
description=
'''It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.
'''),
],
protocol=dict(
IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in the manner of a Jupyter notebook. The code must be valid Python that contains only Python methods.'''
actions=[dict(type=IPythonInterpreter, user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)]
protocol=dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
action=dict(role='ACTION', begin='Tool:', end='\n'),
action_input=dict(role='ARGS', begin='Tool Input:', end='\n'),
response=dict(role='RESPONSE', begin='Tool Response:', end='\n'),
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
),
batch_size=8,
),
]
)
work_dir = './outputs/cibench/'
_agent_models = []
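# The ReAct protocol returns tool output through a SYSTEM round; patch any
# chat meta_template that lacks one so tool responses can be injected.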
for m in _origin_models:
m = deepcopy(m)
if 'meta_template' in m and 'round' in m['meta_template']:
round = m['meta_template']['round']
if all(r['role'].upper() != 'SYSTEM' for r in round): # no system round
if not any('api_role' in r for r in round):
m['meta_template']['round'].append(dict(role='system', begin='System response:', end='\n'))
else:
m['meta_template']['round'].append(dict(role='system', api_role='SYSTEM'))
print(f'WARNING: adding SYSTEM round in meta_template for {m.get("abbr", None)}')
_agent_models.append(m)
protocol=dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
)
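# Wrap each base chat model into a CodeAgent. Generation-level kwargs
# (batch_size, max_out_len, max_seq_len) are popped from the inner llm
# config because the agent wrapper manages them itself.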
models = []
for m in _agent_models:
m = deepcopy(m)
origin_abbr = m.pop('abbr')
abbr = origin_abbr
m.pop('batch_size', None)
m.pop('max_out_len', None)
m.pop('max_seq_len', None)
run_cfg = m.pop('run_cfg', {})
agent_model = dict(
abbr=abbr,
summarizer_abbr=origin_abbr,
type=CodeAgent,
agent_type=CIReAct,
max_turn=3,
llm=m,
actions=[dict(type=IPythonInterpreter, user_data_dir='./data/cibench_dataset/datasources', description=IPYTHON_INTERPRETER_DESCRIPTION)],
protocol=protocol,
batch_size=1,
run_cfg=run_cfg,
)
models.append(agent_model)
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=50, gen_task_coef=1),
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=SlurmRunner, max_num_workers=8, retry=2,
type=LocalRunner,
max_num_workers=4,
task=dict(type=OpenICLInferTask)),
)

Some files were not shown because too many files have changed in this diff.