Merge branch 'open-compass:main' into main

Authored by bittersweet1999 on 2024-11-25 10:14:43 +08:00; committed by GitHub.
commit 64a34bccaf
60 changed files with 5191 additions and 558 deletions

.github/scripts/eval_regression_api.py (new file, 39 lines)

@@ -0,0 +1,39 @@
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAISDK
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_gen import \
race_datasets # noqa: F401, E501
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
models = [
dict(
abbr='lmdeploy-api-test',
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://localhost:23333/v1',
path='internlm2',
tokenizer_path='internlm/internlm2_5-7b-chat',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=128,
max_out_len=1024,
max_seq_len=4096,
temperature=0.01,
batch_size=128,
retry=20,
)
]
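As a side note on the convention used above: the "datasets = sum(...)" line flattens every
variable whose name ends in "_datasets" (brought into scope by the read_base() imports) into a
single list. A minimal, self-contained sketch of that pattern, using hypothetical placeholder
entries rather than the real GSM8K/RACE configs:

# Sketch of the "_datasets" aggregation convention; placeholder entries only.
gsm8k_datasets = [dict(abbr='gsm8k-demo', reader_cfg=dict(test_range=''))]
race_datasets = [dict(abbr='race-high-demo', reader_cfg=dict(test_range=''))]

datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
print([d['abbr'] for d in datasets])  # ['gsm8k-demo', 'race-high-demo']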


@@ -2,15 +2,21 @@ from mmengine.config import read_base
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_ppl import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.chatglm.hf_glm4_9b import \
models as hf_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
models as hf_deepseek_v2_lite_model # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
@@ -19,34 +25,58 @@ with read_base():
models as hf_gemma2_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_9b import \
models as hf_gemma2_9b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_2b import \
models as hf_gemma_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_7b import \
models as hf_gemma_7b_model # noqa: F401, E501
from opencompass.configs.models.gemma.vllm_gemma_2b import \
models as vllm_gemma_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.vllm_gemma_7b import \
models as vllm_gemma_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
models as hf_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
models as hf_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
models as hf_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_7b import \
models as lmdeploy_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_20b import \
models as lmdeploy_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
models as hf_llama2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
models as hf_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
models as hf_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
models as lmdeploy_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
models as lmdeploy_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
models as hf_mistral_7b_v0_3_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \
models as vllm_mixtral_8x7b_v0_1_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_7b import \
models as hf_qwen_2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \
models as hf_qwen_2_5_14b_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
models as lmdeploy_qwen2_5_1_5b_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \
models as lmdeploy_qwen2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
@@ -65,11 +95,27 @@ with read_base():
models as hf_yi_1_5_6b_model # noqa: F401, E501
from opencompass.configs.models.yi.hf_yi_1_5_9b import \
models as hf_yi_1_5_9b_model # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501
race_datasets = [race_datasets[1]]
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
for d in datasets:
d['reader_cfg']['test_range'] = '[0:100]'
d['reader_cfg']['test_range'] = '[0:32]'
for m in models:
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
summarizer = dict(
dataset_abbrs=[
['gsm8k', 'accuracy'],
['GPQA_diamond', 'accuracy'],
['race-high', 'accuracy'],
['winogrande', 'accuracy'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)


@@ -0,0 +1,188 @@
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.ARC_c.ARC_c_few_shot_ppl import \
ARC_c_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.dingo.dingo_gen import \
datasets as dingo_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_gen_a2697c import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
GaokaoBench_datasets # noqa: F401, E501
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import \
gpqa_datasets # noqa: F401, E501
# Corebench v1.7
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \
humaneval_datasets as humaneval_v2_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \
humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
mathbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
sanitized_mbpp_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \
mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import \
nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_few_shot_ppl import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \
BoolQ_datasets # noqa: F401, E501
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
TheoremQA_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \
triviaqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \
wikibench_datasets # noqa: F401, E501
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.summarizers.groups.bbh import \
bbh_summary_groups # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.GaokaoBench import \
GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
race_datasets = [race_datasets[1]] # Only take RACE-High
humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2'
bbh_datasets = [
x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr']
or 'multistep_arithmetic_two' in x['abbr']
]
cmmlu_datasets = [
x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [
'ancient_chinese', 'chinese_civil_service_exam',
'chinese_driving_rule', 'chinese_food_culture',
'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
'chinese_teacher_qualification', 'construction_project_management',
'elementary_chinese', 'elementary_commonsense', 'ethnology',
'high_school_politics', 'modern_chinese',
'traditional_chinese_medicine'
]
]
mmlu_datasets = [
x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
'business_ethics', 'clinical_knowledge', 'college_medicine',
'global_facts', 'human_aging', 'management', 'marketing',
'medical_genetics', 'miscellaneous', 'nutrition',
'professional_accounting', 'professional_medicine', 'virology'
]
]
mmlu_pro_datasets = [mmlu_pro_datasets[0]]
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
GaokaoBench_datasets = [
x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr']
]
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
summarizer = dict(
dataset_abbrs=[
['race-high', 'accuracy'],
['ARC-c', 'accuracy'],
['BoolQ', 'accuracy'],
['mmlu_pro', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['cmmlu', 'naive_average'],
['mmlu', 'naive_average'],
['drop', 'accuracy'],
['bbh', 'naive_average'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['openai_humaneval_v2', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['wikibench-wiki-single_choice_cncircular', 'perf_4'],
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
['dingo_en_192', 'score'],
['dingo_zh_170', 'score'],
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
'###### Overall: Average between MathBench-A and MathBench-T ######',
'Overall',
'',
'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two',
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
['mmlu-other', 'accuracy'],
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
['cmmlu-china-specific', 'accuracy'],
'mmlu_pro',
'mmlu_pro_biology',
'mmlu_pro_business',
'mmlu_pro_chemistry',
'mmlu_pro_computer_science',
'mmlu_pro_economics',
'mmlu_pro_engineering',
'mmlu_pro_health',
'mmlu_pro_history',
'mmlu_pro_law',
'mmlu_pro_math',
'mmlu_pro_philosophy',
'mmlu_pro_physics',
'mmlu_pro_psychology',
'mmlu_pro_other',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
for d in datasets:
d['reader_cfg']['test_range'] = '[0:16]'
for m in models:
m['abbr'] = m['abbr'] + '_fullbench'
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])


@@ -1,7 +1,5 @@
from mmengine.config import read_base
from opencompass.models import OpenAISDK
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
@@ -29,6 +27,12 @@ with read_base():
models as hf_gemma2_2b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
models as hf_gemma2_9b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_2b_it import \
models as hf_gemma_2b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_7b_it import \
models as hf_gemma_7b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
models as lmdeploy_gemma_9b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
models as vllm_gemma_7b_it_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
@@ -51,18 +55,37 @@ with read_base():
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
models as hf_llama3_1_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_2_3b_instruct import \
models as hf_llama3_2_3b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as hf_llama3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \
models as lmdeploy_llama2_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
models as lmdeploy_llama3_2_3b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \
models as hf_mistral_7b_instruct_v0_3_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_nemo_instruct_2407 import \
models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
models as hf_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \
models as lmdeploy_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
models as \
lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \
models as vllm_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm3_4b import \
models as hf_minicpm3_4b_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
@@ -73,6 +96,10 @@ with read_base():
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \
models as hf_qwen2_5_14b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
models as lmdeploy_qwen2_5_14b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
@@ -89,10 +116,8 @@ with read_base():
models as hf_yi_1_5_6b_chat_model # noqa: F401, E501
from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
models as hf_yi_1_5_9b_chat_model # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
race_datasets = [race_datasets[1]]
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
api_meta_template = dict(
@@ -103,25 +128,24 @@ api_meta_template = dict(
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
model_name = ''
models.append(
dict(
abbr='lmdeploy-api-test',
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://judgemodel:10001/v1',
path='compass_judger_internlm2_102b_0508',
tokenizer_path='internlm/internlm2_5-20b-chat',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=50,
max_out_len=1024,
max_seq_len=4096,
temperature=0.01,
batch_size=128,
retry=3,
))
for d in datasets:
d['reader_cfg']['test_range'] = '[0:100]'
d['reader_cfg']['test_range'] = '[0:32]'
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
summarizer = dict(
dataset_abbrs=[
'gsm8k',
'race-middle',
'race-high',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)


@@ -0,0 +1,300 @@
from mmengine.config import read_base
with read_base():
# read hf models - chat models
# Dataset
from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import \
aime2024_datasets # noqa: F401, E501
from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
ARC_c_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
cmo_fib_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \
ds1000_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
GaokaoBench_datasets # noqa: F401, E501
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets # noqa: F401, E501
# new datasets in Fullbench v1.1
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \
humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import \
humanevalx_datasets # noqa: F401, E501
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
ifeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
LCB_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
mathbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
sanitized_mbpp_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \
mmmlu_lite_datasets # noqa: F401, E501
from opencompass.configs.datasets.musr.musr_gen_3c6e15 import \
musr_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_cot_gen_d95929 import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.scicode.scicode_gen_085b98 import \
SciCode_datasets # noqa: F401, E501
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \
BoolQ_datasets # noqa: F401, E501
from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
teval_datasets as teval_en_datasets # noqa: F401, E501
from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
teval_datasets as teval_zh_datasets # noqa: F401, E501
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
TheoremQA_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \
triviaqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import \
wikibench_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.bbh import \
bbh_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.ds1000 import \
ds1000_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.GaokaoBench import \
GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.humanevalx import \
humanevalx_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.musr_average import \
summarizer as musr_summarizer # noqa: F401, E501
from opencompass.configs.summarizers.groups.scicode import \
scicode_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.teval import \
teval_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.mmmlu_lite import \
mmmlu_summary_groups # noqa: F401, E501
# For HumanEval-X Evaluation
# Set the evaluator ip_address and port
race_datasets = [race_datasets[1]]
for item in humanevalx_datasets:
item['eval_cfg']['evaluator'][
'ip_address'] = 'codeeval.opencompass.org.cn/humanevalx'
item['eval_cfg']['evaluator']['port'] = ''
# For DS-1000 Evaluation
# Set the evaluator ip_address and port
for item in ds1000_datasets:
item['eval_cfg']['evaluator'][
'ip_address'] = 'codeeval.opencompass.org.cn/ds1000'
item['eval_cfg']['evaluator']['port'] = ''
bbh_datasets = [
x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr']
or 'multistep_arithmetic_two' in x['abbr']
]
cmmlu_datasets = [
x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [
'ancient_chinese', 'chinese_civil_service_exam',
'chinese_driving_rule', 'chinese_food_culture',
'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
'chinese_teacher_qualification', 'construction_project_management',
'elementary_chinese', 'elementary_commonsense', 'ethnology',
'high_school_politics', 'modern_chinese',
'traditional_chinese_medicine'
]
]
mmlu_datasets = [
x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
'business_ethics', 'clinical_knowledge', 'college_medicine',
'global_facts', 'human_aging', 'management', 'marketing',
'medical_genetics', 'miscellaneous', 'nutrition',
'professional_accounting', 'professional_medicine', 'virology'
]
]
mmlu_pro_datasets = [mmlu_pro_datasets[0]]
mmmlu_lite_datasets = [
x for x in mmmlu_lite_datasets if 'mmlu_lite_AR-XY' in x['abbr']
]
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
GaokaoBench_datasets = [
x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr']
]
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')
and 'scicode' not in k.lower() and 'teval' not in k),
[],
)
datasets += teval_en_datasets
datasets += teval_zh_datasets
# datasets += SciCode_datasets
musr_summary_groups = musr_summarizer['summary_groups']
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
summary_groups.append(
{
'name': 'Mathbench',
'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
}, )
# Summarizer
summarizer = dict(
dataset_abbrs=[
'Language',
['race-high', 'accuracy'],
['ARC-c', 'accuracy'],
['BoolQ', 'accuracy'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['mmmlu_lite', 'naive_average'],
'',
'Instruction Following',
['IFEval', 'Prompt-level-strict-accuracy'],
'',
'General Reasoning',
['drop', 'accuracy'],
['bbh', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
['musr_average', 'naive_average'],
'',
'Math Calculation',
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
['math', 'accuracy'],
['cmo_fib', 'accuracy'],
['aime2024', 'accuracy'],
['Mathbench', 'naive_average'],
'',
'Knowledge',
['wikibench-wiki-single_choice_cncircular', 'perf_4'],
['cmmlu', 'naive_average'],
['mmlu', 'naive_average'],
['mmlu_pro', 'naive_average'],
'',
'Code',
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['humanevalx', 'naive_average'],
['ds1000', 'naive_average'],
['lcb_code_generation', 'pass@1'],
['lcb_code_execution', 'pass@1'],
['lcb_test_output', 'pass@1'],
'',
'Agent',
['teval', 'naive_average'],
['SciCode', 'accuracy'],
['SciCode', 'sub_accuracy'],
'',
'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two',
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
'mmlu-other',
'',
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
'cmmlu-china-specific',
'',
'mmlu_pro',
'mmlu_pro_biology',
'mmlu_pro_business',
'mmlu_pro_chemistry',
'mmlu_pro_computer_science',
'mmlu_pro_economics',
'mmlu_pro_engineering',
'mmlu_pro_health',
'mmlu_pro_history',
'mmlu_pro_law',
'mmlu_pro_math',
'mmlu_pro_philosophy',
'mmlu_pro_physics',
'mmlu_pro_psychology',
'mmlu_pro_other',
'',
'ds1000_Pandas',
'ds1000_Numpy',
'ds1000_Tensorflow',
'ds1000_Scipy',
'ds1000_Sklearn',
'ds1000_Pytorch',
'ds1000_Matplotlib',
'',
'mmmlu_lite',
'openai_mmmlu_lite_AR-XY',
'openai_mmmlu_lite_BN-BD',
'openai_mmmlu_lite_DE-DE',
'openai_mmmlu_lite_ES-LA',
'openai_mmmlu_lite_FR-FR',
'openai_mmmlu_lite_HI-IN',
'openai_mmmlu_lite_ID-ID',
'openai_mmmlu_lite_IT-IT',
'openai_mmmlu_lite_JA-JP',
'openai_mmmlu_lite_KO-KR',
'openai_mmmlu_lite_PT-BR',
'openai_mmmlu_lite_SW-KE',
'openai_mmmlu_lite_YO-NG',
'openai_mmmlu_lite_ZH-CN',
'',
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
],
summary_groups=summary_groups,
)
for d in datasets:
d['reader_cfg']['test_range'] = '[0:16]'
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
m['abbr'] = m['abbr'] + '_fullbench'
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])


@@ -0,0 +1,70 @@
from copy import deepcopy
from mmengine.config import read_base
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import SubjectiveSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
with read_base():
# read hf models - chat models
# Dataset
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
alignbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \
alpacav2_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
arenahard_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \
compassarena_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \
fofo_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \
followbench_llmeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \
mtbench101_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \
wildbench_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
and 'mtbench101' not in k and 'wildbench' not in k), [])
datasets += mtbench101_datasets # noqa: F401, E501
datasets += wildbench_datasets # noqa: F401, E501
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
m['abbr'] = m['abbr'] + '_fullbench'
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
judge_models = deepcopy([models[1]])
judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
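A note on the judge selection above: instead of pointing at an external judge API, this config
deep-copies the second entry of the sorted model list (with these imports, presumably the
lmdeploy-deployed internlm2_5-7b-chat) and tags it with a '-judge' suffix so its results are kept
separate. A small sketch of that step with hypothetical model dicts:

from copy import deepcopy

# Hypothetical model entries; only the fields touched by the selection logic are shown.
models = [
    dict(abbr='internlm2_5-7b-chat-hf_fullbench', run_cfg=dict(num_gpus=1)),
    dict(abbr='internlm2_5-7b-chat-turbomind_fullbench', run_cfg=dict(num_gpus=1)),
]
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])  # stable sort keeps import order

judge_models = deepcopy([models[1]])  # the second model doubles as the judge
judge_models[0]['abbr'] += '-judge'   # keep judge results separate from candidate results
print(judge_models[0]['abbr'])        # internlm2_5-7b-chat-turbomind_fullbench-judge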


@@ -7,36 +7,56 @@ import yaml
output_path = 'regression_result_daily'
chat_model_list = [
'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf',
'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind',
'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'baichuan2-7b-chat-hf', 'glm-4-9b-chat-hf', 'glm-4-9b-chat-turbomind',
'glm-4-9b-chat-vllm', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', 'gemma2-9b-it-hf',
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'gemma-2-9b-it-turbomind',
'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-lmdeploy',
'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm',
'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf',
'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm',
'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
'llama-3_1-8b-instruct-hf', 'llama-3_2-3b-instruct-hf',
'llama-3-8b-instruct-hf', 'llama-3_1-8b-instruct-turbomind',
'llama-3_2-3b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.2-hf', 'mistral-7b-instruct-v0.3-hf',
'mistral-nemo-instruct-2407-hf', 'mistral-nemo-instruct-2407-turbomind',
'mistral-7b-instruct-v0.1-vllm', 'mistral-7b-instruct-v0.2-vllm',
'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf',
'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf',
'lmdeploy-api-test'
'deepseek-v2-lite-chat-hf', 'internlm2_5-20b-chat-hf',
'internlm2_5-20b-chat-turbomind', 'mistral-small-instruct-2409-hf',
'mistral-small-instruct-2409-turbomind', 'qwen2.5-14b-instruct-hf',
'qwen2.5-14b-instruct-turbomind'
]
base_model_list = [
'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf',
'deepseek-7b-base-turbomind', 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf',
'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf',
'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind',
'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind',
'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf',
'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf',
'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
'glm-4-9b-hf', 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', 'gemma2-9b-hf',
'gemma-2b-hf', 'gemma-7b-hf', 'gemma-2b-vllm', 'gemma-7b-vllm',
'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
'internlm2-1.8b-turbomind', 'internlm2_5-7b-turbomind',
'internlm2-7b-turbomind', 'internlm2-base-7b-turbomind', 'llama-2-7b-hf',
'llama-3_1-8b-hf', 'llama-3-8b-hf', 'llama-3.1-8b-turbomind',
'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'mistral-7b-v0.3-hf',
'mistral-7b-v0.2-vllm', 'qwen2.5-7b-hf', 'qwen2.5-1.5b-turbomind',
'qwen2.5-7b-turbomind', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind',
'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf',
'deepseek-v2-lite-hf', 'internlm2-20b-hf', 'internlm2-base-20b-hf',
'internlm2-20b-turbomind', 'qwen2.5-14b-hf'
]
dataset_list = ['gsm8k', 'race-middle', 'race-high']
@pytest.fixture()
def baseline_scores_testrange(request):
config_path = os.path.join(
request.config.rootdir,
'.github/scripts/oc_score_baseline_testrange.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
return config
@pytest.fixture()
@@ -48,6 +68,16 @@ def baseline_scores(request):
return config
@pytest.fixture()
def baseline_scores_fullbench(request):
config_path = os.path.join(
request.config.rootdir,
'.github/scripts/oc_score_baseline_fullbench.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
return config
@pytest.fixture()
def result_scores():
file = find_csv_files(output_path)
@@ -57,100 +87,231 @@ def result_scores():
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.usefixtures('baseline_scores_testrange')
@pytest.mark.chat
class TestChat:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2)
for p1 in chat_model_list
for p2 in dataset_list])
def test_model_dataset_score(self, baseline_scores, result_scores, model,
dataset):
base_score = baseline_scores.get(model).get(dataset)
@pytest.mark.parametrize('model, dataset',
[(p1, p2) for p1 in chat_model_list
for p2 in ['gsm8k', 'race-high']])
def test_model_dataset_score(self, baseline_scores_testrange,
result_scores, model, dataset):
base_score = baseline_scores_testrange.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, base_score)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.usefixtures('baseline_scores_testrange')
@pytest.mark.base
class TestBase:
"""Test cases for base model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2)
for p1 in base_model_list
for p2 in dataset_list])
def test_model_dataset_score(self, baseline_scores, result_scores, model,
dataset):
if model == 'mistral-7b-v0.2-vllm' and dataset == 'race-high':
@pytest.mark.parametrize(
'model, dataset',
[(p1, p2) for p1 in base_model_list
for p2 in ['gsm8k', 'GPQA_diamond', 'race-high', 'winogrande']])
def test_model_dataset_score(self, baseline_scores_testrange,
result_scores, model, dataset):
if model in ['gemma-2b-vllm', 'gemma-7b-vllm'] and dataset != 'gsm8k':
return
base_score = baseline_scores.get(model).get(dataset)
base_score = baseline_scores_testrange.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, base_score)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_obj_fullbench
class TestChatObjFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-hf_fullbench',
'internlm2_5-7b-chat-turbomind_fullbench'
] for p2 in [
'race-high', 'ARC-c', 'BoolQ', 'triviaqa_wiki_1shot', 'nq_open_1shot',
'IFEval', 'drop', 'GPQA_diamond', 'hellaswag', 'TheoremQA',
'musr_average', 'gsm8k', 'math', 'cmo_fib', 'aime2024',
'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000',
'lcb_code_generation', 'lcb_code_execution', 'lcb_test_output',
'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas',
'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn',
'ds1000_Pytorch', 'ds1000_Matplotlib', 'openai_mmmlu_lite_AR-XY',
'college', 'college_knowledge'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_sub_fullbench
class TestChatSubFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-hf_fullbench',
'internlm2_5-7b-chat-turbomind_fullbench'
] for p2 in [
'Alignbench总分', 'Alignbench专业能力', 'AlpacaEvaltotal',
'AlpacaEvalhelpful_base', 'CompassArenacompassarena_language',
'CompassArenacompassarena_knowledge',
'CompassArenacompassarena_reason_v2',
'CompassArenacompassarena_math_v2',
'CompassArenacompassarena_creationv2_zh', 'Fofofofo_test_prompts',
'followbenchHSR_AVG', 'followbenchSSR_AVG', 'followbenchHSR_L1',
'followbenchHSR_L2', 'followbenchHSR_L3', 'followbenchHSR_L4',
'followbenchHSR_L5', 'followbenchSSR_L1', 'followbenchSSR_L2',
'followbenchSSR_L3', 'followbenchSSR_L4', 'followbenchSSR_L5',
'MTBench101average', 'Wildbenchscore'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.base_fullbench
class TestBaseFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench'
] for p2 in [
'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k',
'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag',
'TheoremQA', 'dingo_en_192', 'dingo_zh_170', 'college',
'college_knowledge', 'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two', 'mmlu-other', 'cmmlu-china-specific',
'mmlu_pro_math'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.api
class TestApibench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset',
[('lmdeploy-api-test', 'race-middle'),
('lmdeploy-api-test', 'race-high'),
('lmdeploy-api-test', 'gsm8k')])
def test_api(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
class TestCmdCase:
@pytest.mark.case1
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-hf', 'race-middle'),
('internlm2_5-7b-hf', 'race-high')])
def test_cmd_case1(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b-hf', 'race-high'),
('internlm2_5-7b-hf', 'demo_gsm8k'),
('internlm2-1.8b-hf', 'race-middle'),
('internlm2-1.8b-hf', 'race-high'),
('internlm2-1.8b-hf', 'demo_gsm8k')])
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model, result_score, base_score)
@pytest.mark.case2
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-chat-lmdeploy', 'race-middle'),
('internlm2_5-7b-chat-lmdeploy', 'race-high')])
def test_cmd_case2(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b-chat-lmdeploy', 'race-high'),
('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k'),
('internlm2-chat-1.8b-lmdeploy', 'race-middle'),
('internlm2-chat-1.8b-lmdeploy', 'race-high'),
('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k')])
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model + '_batch', result_score, base_score)
@pytest.mark.case3
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b_hf', 'race-middle'),
('internlm2_5-7b_hf', 'race-high')])
def test_cmd_case3(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b_hf', 'race-high'),
('internlm2_5-7b_hf', 'demo_gsm8k')])
def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model, result_score, base_score)
@pytest.mark.case4
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-chat_hf', 'race-middle'),
('internlm2_5-7b-chat_hf', 'race-high')])
def test_cmd_case4(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b-chat_hf', 'race-high'),
('internlm2_5-7b-chat_hf', 'demo_gsm8k')])
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model, result_score, base_score)
def assert_score(score, baseline):
THRESHOLD = 3
def assert_score(model_type, score, baseline):
if score is None or score == '-':
assert False, 'value is none'
if float(score) <= (baseline + 5) and float(score) >= (baseline - 5):
print(score + ' between ' + str(baseline - 5) + ' and ' +
str(baseline + 5))
assert True
if 'batch' not in model_type:
if float(score) <= (baseline + 0.01) and float(score) >= (baseline -
0.01):
print(' '.join([score, 'is equal', str(baseline)]))
assert True
else:
print(' '.join([score, 'is not equal', str(baseline)]))
assert False, ' '.join([score, 'is not equal', str(baseline)])
else:
assert False, score + ' not between ' + str(
baseline - 5) + ' and ' + str(baseline + 5)
if float(score) <= (baseline + THRESHOLD) and float(score) >= (
baseline - THRESHOLD):
print(' '.join([
score, 'is between',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
]))
assert True
else:
print(' '.join([
score, 'is not between',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
]))
assert False, ' '.join([
score, 'is not between',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
])
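The reworked assert_score therefore applies two tolerances: runs whose model_type contains
'batch' may drift within +/- THRESHOLD (3 points) of the baseline, while all other runs must
reproduce the baseline to within 0.01. A condensed sketch of the same decision rule, with
illustrative values:

def within_tolerance(model_type, score, baseline, threshold=3):
    # Condensed restatement of the assert_score branching above.
    if 'batch' not in model_type:
        return abs(float(score) - baseline) <= 0.01   # near-exact reproduction required
    return abs(float(score) - baseline) <= threshold  # batch runs get +/- threshold

print(within_tolerance('internlm2_5-7b-hf', '91.78', 91.78))      # True: matches baseline
print(within_tolerance('lmdeploy-api-test_batch', '89.5', 91.0))  # True: within +/- 3
print(within_tolerance('internlm2_5-7b-hf', '90.5', 91.78))       # False: non-batch must match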
def find_csv_files(directory):
csv_files = []
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith('.csv'):
if file.endswith('.csv') and (file.startswith('summary') or
file.startswith('Subjective_all')):
csv_files.append(os.path.join(root, file))
csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
@@ -163,14 +324,24 @@ def read_csv_file(file_path):
with open(file_path, 'r') as csvfile:
reader = csv.DictReader(csvfile)
filtered_data = []
for row in reader:
filtered_row = {
k: v
for k, v in row.items()
if k not in ['version', 'metric', 'mode']
}
filtered_data.append(filtered_row)
if 'Subjective_all' not in file_path:
for row in reader:
if row['metric'] is not None and 'bpb' not in row['metric']:
filtered_row = {
k: v
for k, v in row.items()
if k not in ['version', 'metric', 'mode']
}
filtered_data.append(filtered_row)
else:
for row in reader:
if row['Detailed Scores'] is not None:
filtered_row = row
filtered_row['dataset'] = filtered_row[
'Dataset'] + filtered_row['Detailed Scores']
del filtered_row['Dataset']
del filtered_row['Detailed Scores']
filtered_data.append(filtered_row)
result = {}
for data in filtered_data:


@@ -1,369 +1,34 @@
baichuan2-7b-chat-hf:
gsm8k: 30
race-middle: 74
race-high: 79
internlm2_5-7b-hf:
demo_gsm8k: 42.19
race-middle: 91.78
race-high: 90.02
glm-4-9b-chat-hf:
gsm8k: 75
race-middle: 88
race-high: 88
internlm2_5-7b_hf:
demo_gsm8k: 42.19
race-middle: 91.78
race-high: 90.02
glm-4-9b-chat-turbomind:
gsm8k: 69
race-middle: 82
race-high: 77
internlm2-1.8b-hf:
demo_gsm8k: 15.62
race-middle: 71.66
race-high: 66.38
glm-4-9b-chat-vllm:
gsm8k: 73
race-middle: 87
race-high: 87
internlm2_5-7b-chat-lmdeploy:
demo_gsm8k: 84.38
race-middle: 92.76
race-high: 90.54
deepseek-7b-chat-hf:
gsm8k: 60
race-middle: 74
race-high: 80
internlm2-chat-1.8b-lmdeploy:
demo_gsm8k: 31
race-middle: 81.34
race-high: 73.96
deepseek-moe-16b-chat-hf:
gsm8k: 62
race-middle: 62
race-high: 70
deepseek-v2-lite-chat-hf:
gsm8k: 59
race-middle: 82
race-high: 79
deepseek-7b-chat-vllm:
gsm8k: 63
race-middle: 74
race-high: 79
gemma-2b-it-hf:
gsm8k: 14
race-middle: 62
race-high: 52
gemma-7b-it-hf:
gsm8k: 39
race-middle: 74
race-high: 71
gemma-7b-it-vllm:
gsm8k: 38
race-middle: 75
race-high: 70
gemma2-2b-it-hf:
gsm8k: 62
race-middle: 75
race-high: 67
gemma2-9b-it-hf:
gsm8k: 80
race-middle: 89
race-high: 85
internlm2_5-7b-chat-hf:
gsm8k: 86
race-middle: 92
race-high: 93
internlm2_5-20b-chat-hf:
gsm8k: 91
race-middle: 95
race-high: 91
internlm2_5-7b-chat-turbomind:
gsm8k: 87
race-middle: 92
race-high: 93
internlm2_5-20b-chat-turbomind:
gsm8k: 91
race-middle: 95
race-high: 91
internlm2-chat-1.8b-turbomind:
gsm8k: 40
race-middle: 82
race-high: 83
internlm2-chat-1.8b-sft-turbomind:
gsm8k: 34
race-middle: 81
race-high: 83
internlm2-chat-7b-lmdeploy:
gsm8k: 69
race-middle: 90
race-high: 88
internlm2-chat-7b-sft-turbomind:
gsm8k: 71
race-middle: 91
race-high: 92
internlm2-chat-7b-vllm:
gsm8k: 63
race-middle: 90
race-high: 91
llama-3_1-8b-instruct-hf:
gsm8k: 82
race-middle: 82
race-high: 88
llama-3-8b-instruct-hf:
gsm8k: 77
race-middle: 85
race-high: 87
llama-3_1-8b-instruct-turbomind:
gsm8k: 79
race-middle: 82
race-high: 88
llama-3-8b-instruct-turbomind:
gsm8k: 77
race-middle: 85
race-high: 89
mistral-7b-instruct-v0.2-hf:
gsm8k: 48
race-middle: 82
race-high: 78
mistral-7b-instruct-v0.3-hf:
gsm8k: 53
race-middle: 80
race-high: 78
mistral-7b-instruct-v0.2-vllm:
gsm8k: 49
race-middle: 81
race-high: 77
minicpm-2b-dpo-fp32-hf:
gsm8k: 58
race-middle: 66
race-high: 74
minicpm-2b-sft-bf16-hf:
gsm8k: 58
race-middle: 75
race-high: 81
minicpm-2b-sft-fp32-hf:
gsm8k: 58
race-middle: 75
race-high: 81
phi-3-mini-4k-instruct-hf:
gsm8k: 67
race-middle: 81
race-high: 84
phi-3-small-8k-instruct-hf:
gsm8k: 88
race-middle: 89
race-high: 88
qwen1.5-0.5b-chat-hf:
gsm8k: 5
race-middle: 55
race-high: 50
qwen2-1.5b-instruct-hf:
gsm8k: 63
race-middle: 77
race-high: 86
qwen2-1.5b-instruct-turbomind:
gsm8k: 60
race-middle: 77
race-high: 86
qwen2-7b-instruct-turbomind:
gsm8k: 88
race-middle: 87
race-high: 89
qwen2-7b-instruct-hf:
gsm8k: 85
race-middle: 87
race-high: 91
qwen1.5-0.5b-chat-vllm:
gsm8k: 5
race-middle: 57
race-high: 51
yi-1.5-6b-chat-hf:
gsm8k: 72
race-middle: 88
race-high: 86
yi-1.5-9b-chat-hf:
gsm8k: 81
race-middle: 89
race-high: 91
internlm2_5-7b-chat_hf:
demo_gsm8k: 87.50
race-middle: 92.76
race-high: 90.48
lmdeploy-api-test:
gsm8k: 90
race-middle: 95
race-high: 96
deepseek-moe-16b-base-hf:
gsm8k: 25
race-middle: 35
race-high: 23
deepseek-v2-lite-hf:
gsm8k: 37
race-middle: 56
race-high: 62
deepseek-7b-base-turbomind:
gsm8k: 21
race-middle: 42
race-high: 42
deepseek-moe-16b-base-vllm:
gsm8k: 22
race-middle: 35
race-high: 20
gemma-2b-hf:
gsm8k: 19
race-middle: 33
race-high: 26
gemma-7b-hf:
gsm8k: 65
race-middle: 59
race-high: 66
gemma2-2b-hf:
gsm8k: 33
race-middle: 56
race-high: 58
gemma2-9b-hf:
gsm8k: 70
race-middle: 82
race-high: 84
internlm2_5-7b-hf:
gsm8k: 47
race-middle: 92
race-high: 91
internlm2-7b-hf:
gsm8k: 65
race-middle: 77
race-high: 72
internlm2-base-7b-hf:
gsm8k: 5
race-middle: 71
race-high: 74
internlm2_5-7b-turbomind:
gsm8k: 73
race-middle: 90
race-high: 91
internlm2-1.8b-turbomind:
gsm8k: 25
race-middle: 75
race-high: 72
internlm2-7b-turbomind:
gsm8k: 67
race-middle: 78
race-high: 76
internlm2-base-7b-turbomind:
gsm8k: 39
race-middle: 75
race-high: 81
llama-2-7b-hf:
gsm8k: 17
race-middle: 32
race-high: 38
llama-3-8b-hf:
gsm8k: 48
race-middle: 64
race-high: 70
llama-3.1-8b-turbomind:
gsm8k: 57
race-middle: 67
race-high: 75
llama-3-8b-turbomind:
gsm8k: 52
race-middle: 63
race-high: 70
mistral-7b-v0.2-hf:
gsm8k: 43
race-middle: 42
race-high: 60
mistral-7b-v0.3-hf:
gsm8k: 43
race-middle: 42
race-high: 60
mistral-7b-v0.2-vllm:
gsm8k: 45
race-middle: 42
race-high: 58
qwen1.5-moe-a2.7b-hf:
gsm8k: 64
race-middle: 78
race-high: 90
qwen2-1.5b-hf:
gsm8k: 58
race-middle: 65
race-high: 78
qwen2-0.5b-hf:
gsm8k: 35
race-middle: 52
race-high: 48
qwen2-7b-hf:
gsm8k: 82
race-middle: 88
race-high: 89
qwen2-1.5b-turbomind:
gsm8k: 57
race-middle: 64
race-high: 78
qwen2-7b-turbomind:
gsm8k: 83
race-middle: 88
race-high: 88
qwen1.5-0.5b-vllm:
gsm8k: 12
race-middle: 54
race-high: 59
yi-1.5-6b-hf:
gsm8k: 59
race-middle: 81
race-high: 89
yi-1.5-9b-hf:
gsm8k: 77
race-middle: 90
race-high: 90
gsm8k: 83.78
race-middle: 92.41
race-high: 90.37


@@ -0,0 +1,173 @@
internlm2_5-7b-chat-hf_fullbench:
race-high: 93.75
ARC-c: 93.75
BoolQ: 81.25
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
IFEval: 50
drop: 81.25
GPQA_diamond: 25
hellaswag: 87.5
TheoremQA: 18.75
musr_average: 39.58
gsm8k: 56.25
math: 75
cmo_fib: 6.25
aime2024: 6.25
wikibench-wiki-single_choice_cncircular: 50
sanitized_mbpp: 68.75
ds1000: 16.96
lcb_code_generation: 12.5
lcb_code_execution: 43.75
lcb_test_output: 18.75
bbh-logical_deduction_seven_objects: 50
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 72.6
cmmlu-china-specific: 76.25
mmlu_pro_math: 25
ds1000_Pandas: 12.5
ds1000_Numpy: 0
ds1000_Tensorflow: 12.5
ds1000_Scipy: 18.75
ds1000_Sklearn: 18.75
ds1000_Pytorch: 12.5
ds1000_Matplotlib: 43.75
openai_mmmlu_lite_AR-XY: 37.5
college: 12.5
college_knowledge: 87.5
Alignbench总分: 0.65
Alignbench专业能力: 7.83
AlpacaEvaltotal: 0
AlpacaEvalhelpful_base: 0
CompassArenacompassarena_language: 60
CompassArenacompassarena_knowledge: 56
CompassArenacompassarena_reason_v2: 50
CompassArenacompassarena_math_v2: 53.5
CompassArenacompassarena_creationv2_zh: 48.75
Fofofofo_test_prompts: 1
followbenchHSR_AVG: 1
followbenchSSR_AVG: 1
followbenchHSR_L1: 1
followbenchHSR_L2: 1
followbenchHSR_L3: 1
followbenchHSR_L4: 1
followbenchHSR_L5: 1
followbenchSSR_L1: 1
followbenchSSR_L2: 1
followbenchSSR_L3: 1
followbenchSSR_L4: 1
followbenchSSR_L5: 1
MTBench101average: 8.1
Wildbenchscore: -3.3333333333333335
internlm2_5-7b-chat-turbomind_fullbench:
race-high: 93.75
ARC-c: 87.5
BoolQ: 68.75
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
IFEval: 50
drop: 75
hellaswag: 81.25
TheoremQA: 6.25
musr_average: 39.58
gsm8k: 68.75
math: 75
GPQA_diamond: 25
cmo_fib: 6.25
aime2024: 6.25
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 68.75
ds1000: 13.39
lcb_code_generation: 12.5
lcb_code_execution: 43.75
lcb_test_output: 12.5
bbh-logical_deduction_seven_objects: 56.25
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 74.04
cmmlu-china-specific: 76.25
mmlu_pro_math: 25
ds1000_Pandas: 0
ds1000_Numpy: 0
ds1000_Tensorflow: 12.5
ds1000_Scipy: 18.75
ds1000_Sklearn: 18.75
ds1000_Pytorch: 6.25
ds1000_Matplotlib: 37.5
openai_mmmlu_lite_AR-XY: 37.5
college: 0
college_knowledge: 87.5
Alignbench总分: 0.64
Alignbench专业能力: 7.6
AlpacaEvaltotal: 10
AlpacaEvalhelpful_base: 10
CompassArenacompassarena_language: 59
CompassArenacompassarena_knowledge: 57
CompassArenacompassarena_reason_v2: 49.5
CompassArenacompassarena_math_v2: 51
CompassArenacompassarena_creationv2_zh: 43.75
Fofofofo_test_prompts: 1
followbenchHSR_AVG: 1
followbenchSSR_AVG: 1
followbenchHSR_L1: 1
followbenchHSR_L2: 1
followbenchHSR_L3: 1
followbenchHSR_L4: 1
followbenchHSR_L5: 1
followbenchSSR_L1: 1
followbenchSSR_L2: 1
followbenchSSR_L3: 1
followbenchSSR_L4: 1
followbenchSSR_L5: 1
MTBench101average: 8.1
Wildbenchscore: -8.333333333333334
internlm2_5-7b-hf_fullbench:
race-high: 100
ARC-c: 68.75
BoolQ: 87.5
GPQA_diamond: 62.5
drop: 62.5
math: 12.5
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 56.25
gsm8k: 37.5
triviaqa_wiki_1shot: 43.75
nq_open_1shot: 43.75
winogrande: 75
hellaswag: 93.75
TheoremQA: 25
dingo_en_192: 37.5
dingo_zh_170: 100
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 43.75
bbh-multistep_arithmetic_two: 56.25
mmlu-other: 76.92
cmmlu-china-specific: 84.17
mmlu_pro_math: 18.75
internlm2_5-7b-turbomind_fullbench:
race-high: 100
ARC-c: 68.75
BoolQ: 87.5
GPQA_diamond: 62.5
drop: 62.5
math: 18.75
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 56.25
gsm8k: 68.75
triviaqa_wiki_1shot: 43.75
nq_open_1shot: 43.75
winogrande: 87.5
hellaswag: 93.75
TheoremQA: 31.25
dingo_en_192: 43.75
dingo_zh_170: 100
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 50
bbh-multistep_arithmetic_two: 56.25
mmlu-other: 76.92
cmmlu-china-specific: 84.17
mmlu_pro_math: 18.75

View File

@ -0,0 +1,459 @@
baichuan2-7b-chat-hf:
gsm8k: 18.75
race-high: 78.12
glm-4-9b-chat-hf:
gsm8k: 68.75
race-high: 90.62
glm-4-9b-chat-turbomind:
gsm8k: 75.00
race-high: 90.62
glm-4-9b-chat-vllm:
gsm8k: 65.62
race-high: 90.62
deepseek-7b-chat-hf:
gsm8k: 46.88
race-high: 81.25
deepseek-moe-16b-chat-hf:
gsm8k: 50
race-high: 68.75
deepseek-7b-chat-vllm:
gsm8k: 43.75
race-high: 75
gemma2-2b-it-hf:
gsm8k: 50
race-high: 71.88
gemma2-9b-it-hf:
gsm8k: 71.88
race-high: 84.38
gemma-2b-it-hf:
gsm8k: 3.12
race-high: 40.62
gemma-7b-it-hf:
gsm8k: 40.62
race-high: 68.75
gemma-2-9b-it-turbomind:
gsm8k: 65.62
race-high: 84.38
gemma-7b-it-vllm:
gsm8k: 34.38
race-high: 68.75
internlm2_5-7b-chat-hf:
gsm8k: 84.38
race-high: 90.62
internlm2_5-7b-chat-turbomind:
gsm8k: 84.38
race-high: 90.62
internlm2-chat-1.8b-turbomind:
gsm8k: 25
race-high: 84.38
internlm2-chat-1.8b-sft-turbomind:
gsm8k: 21.88
race-high: 84.38
internlm2-chat-7b-lmdeploy:
gsm8k: 53.12
race-high: 84.38
internlm2-chat-7b-sft-turbomind:
gsm8k: 50
race-high: 90.62
internlm2-chat-7b-vllm:
gsm8k: 43.75
race-high: 87.5
llama-3_1-8b-instruct-hf:
gsm8k: 84.38
race-high: 90.62
llama-3_2-3b-instruct-hf:
gsm8k: 65.62
race-high: 81.25
llama-3-8b-instruct-hf:
gsm8k: 68.75
race-high: 87.5
llama-3_1-8b-instruct-turbomind:
gsm8k: 78.12
race-high: 90.62
llama-3_2-3b-instruct-turbomind:
gsm8k: 62.50
race-high: 81.25
llama-3-8b-instruct-turbomind:
gsm8k: 68.75
race-high: 87.5
mistral-7b-instruct-v0.2-hf:
gsm8k: 40.62
race-high: 75
mistral-7b-instruct-v0.3-hf:
gsm8k: 40.62
race-high: 75
mistral-nemo-instruct-2407-hf:
gsm8k: 75
race-high: 81.25
mistral-nemo-instruct-2407-turbomind:
gsm8k: 68.75
race-high: 87.50
mistral-7b-instruct-v0.1-vllm:
gsm8k: 34.38
race-high: 68.75
mistral-7b-instruct-v0.2-vllm:
gsm8k: 43.75
race-high: 75
MiniCPM3-4B-hf:
gsm8k: 68.75
race-high: 84.38
minicpm-2b-dpo-fp32-hf:
gsm8k: 56.25
race-high: 53.12
minicpm-2b-sft-bf16-hf:
gsm8k: 46.88
race-high: 65.62
minicpm-2b-sft-fp32-hf:
gsm8k: 46.88
race-high: 65.62
phi-3-mini-4k-instruct-hf:
gsm8k: 56.25
race-high: 84.38
qwen1.5-0.5b-chat-hf:
gsm8k: 0
race-high: 53.12
qwen2-1.5b-instruct-hf:
gsm8k: 62.5
race-high: 84.38
qwen2-7b-instruct-hf:
gsm8k: 68.75
race-high: 90.62
qwen2-1.5b-instruct-turbomind:
gsm8k: 62.50
race-high: 84.38
qwen2-7b-instruct-turbomind:
gsm8k: 81.25
race-high: 87.5
qwen1.5-0.5b-chat-vllm:
gsm8k: 3.12
race-high: 53.12
yi-1.5-6b-chat-hf:
gsm8k: 65.62
race-high: 84.38
yi-1.5-9b-chat-hf:
gsm8k: 75
race-high: 93.75
deepseek-v2-lite-chat-hf:
gsm8k: 43.75
race-high: 71.88
internlm2_5-20b-chat-hf:
gsm8k: 84.38
race-high: 87.5
internlm2_5-20b-chat-turbomind:
gsm8k: 84.38
race-high: 87.5
mistral-small-instruct-2409-hf:
gsm8k: 81.25
race-high: 87.50
mistral-small-instruct-2409-turbomind:
gsm8k: 78.12
race-high: 87.50
qwen2.5-14b-instruct-hf:
gsm8k: 71.88
race-high: 96.88
qwen2.5-14b-instruct-turbomind:
gsm8k: 71.88
race-high: 93.75
glm-4-9b-hf:
gsm8k: 68.75
GPQA_diamond: 31.25
race-high: 93.75
winogrande: 84.38
deepseek-moe-16b-base-hf:
gsm8k: 21.88
GPQA_diamond: 0
race-high: 21.88
winogrande: 65.62
deepseek-7b-base-turbomind:
gsm8k: 21.88
GPQA_diamond: 0
race-high: 46.88
winogrande: 84.38
deepseek-moe-16b-base-vllm:
gsm8k: 21.88
GPQA_diamond: 0
race-high: 25
winogrande: 68.75
gemma2-2b-hf:
gsm8k: 31.25
GPQA_diamond: 3.12
race-high: 56.25
winogrande: 71.88
gemma2-9b-hf:
gsm8k: 68.75
GPQA_diamond: 0
race-high: 81.25
winogrande: 84.38
gemma-2b-hf:
gsm8k: 18.75
GPQA_diamond: 3.12
race-high: 25
winogrande: 53.12
gemma-7b-hf:
gsm8k: 56.25
GPQA_diamond: 6.25
race-high: 65.62
winogrande: 78.12
gemma-2b-vllm:
gsm8k: 15.62
GPQA_diamond: 6.25
race-high:
winogrande:
gemma-7b-vllm:
gsm8k: 53.12
GPQA_diamond: 6.25
race-high:
winogrande:
internlm2_5-7b-hf:
gsm8k: 37.5
GPQA_diamond: 25
race-high: 93.75
winogrande: 71.88
internlm2-7b-hf:
gsm8k: 53.12
GPQA_diamond: 18.75
race-high: 62.5
winogrande: 78.12
internlm2-base-7b-hf:
gsm8k: 3.12
GPQA_diamond: 21.88
race-high: 75
winogrande: 65.62
internlm2-1.8b-turbomind:
gsm8k: 12.5
GPQA_diamond: 12.5
race-high: 71.88
winogrande: 75
internlm2_5-7b-turbomind:
gsm8k: 68.75
GPQA_diamond: 31.25
race-high: 93.75
winogrande: 84.38
internlm2-7b-turbomind:
gsm8k: 56.25
GPQA_diamond: 21.88
race-high: 75
winogrande: 81.25
internlm2-base-7b-turbomind:
gsm8k: 40.62
GPQA_diamond: 28.12
race-high: 84.38
winogrande: 71.88
llama-2-7b-hf:
gsm8k: 21.88
GPQA_diamond: 21.88
race-high: 40.62
winogrande: 71.88
llama-3_1-8b-hf:
gsm8k: 78.12
GPQA_diamond: 25
race-high: 90.62
winogrande: 62.5
llama-3-8b-hf:
gsm8k: 46.88
GPQA_diamond: 6.25
race-high: 65.62
winogrande: 65.62
llama-3.1-8b-turbomind:
gsm8k: 56.25
GPQA_diamond: 6.25
race-high: 78.12
winogrande: 78.12
llama-3-8b-turbomind:
gsm8k: 50
GPQA_diamond: 9.38
race-high: 65.62
winogrande: 78.12
mistral-7b-v0.2-hf:
gsm8k: 31.25
GPQA_diamond: 6.25
race-high: 62.5
winogrande: 59.38
mistral-7b-v0.3-hf:
gsm8k: 31.25
GPQA_diamond: 6.25
race-high: 62.5
winogrande: 59.38
mistral-7b-v0.2-vllm:
gsm8k: 34.38
GPQA_diamond: 6.25
race-high: 62.5
winogrande: 65.62
qwen2.5-7b-hf:
gsm8k: 81.25
GPQA_diamond: 18.75
race-high: 87.5
winogrande: 71.88
qwen2.5-1.5b-turbomind:
gsm8k: 71.88
GPQA_diamond: 15.62
race-high: 78.12
winogrande: 71.88
qwen2.5-7b-turbomind:
gsm8k: 71.88
GPQA_diamond: 25
race-high: 87.5
winogrande: 71.88
qwen1.5-moe-a2.7b-hf:
gsm8k: 62.5
GPQA_diamond: 18.75
race-high: 84.38
winogrande: 75
qwen2-0.5b-hf:
gsm8k: 25
GPQA_diamond: 0
race-high: 40.62
winogrande: 62.5
qwen2-1.5b-hf:
gsm8k: 59.38
GPQA_diamond: 9.38
race-high: 81.25
winogrande: 62.5
qwen2-7b-hf:
gsm8k: 68.75
GPQA_diamond: 9.38
race-high: 87.5
winogrande: 68.75
qwen2-1.5b-turbomind:
gsm8k: 62.50
GPQA_diamond: 6.25
race-high: 81.25
winogrande: 75
qwen2-7b-turbomind:
gsm8k: 68.75
GPQA_diamond: 12.5
race-high: 87.5
winogrande: 71.88
qwen1.5-0.5b-vllm:
gsm8k: 9.38
GPQA_diamond: 0
race-high: 56.25
winogrande: 62.5
yi-1.5-6b-hf:
gsm8k: 62.5
GPQA_diamond: 3.12
race-high: 87.5
winogrande: 62.5
yi-1.5-9b-hf:
gsm8k: 75
GPQA_diamond: 40.62
race-high: 87.5
winogrande: 59.38
deepseek-v2-lite-hf:
gsm8k: 28.12
GPQA_diamond: 21.88
race-high: 59.38
winogrande: 75
internlm2-20b-hf:
gsm8k: 56.25
GPQA_diamond: 15.62
race-high: 68.75
winogrande: 75
internlm2-base-20b-hf:
gsm8k: 12.5
GPQA_diamond: 9.38
race-high: 84.38
winogrande: 65.62
internlm2-20b-turbomind:
gsm8k: 68.75
GPQA_diamond: 15.62
race-high: 68.75
winogrande: 81.25
qwen2.5-14b-hf:
gsm8k: 75
GPQA_diamond: 37.5
race-high: 93.75
winogrande: 84.38

View File

@ -13,13 +13,33 @@ on:
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
build_lmdeploy:
required: false
description: 'whether to build lmdeploy'
type: boolean
default: false
repo_org_lmdeploy:
required: false
description: 'Tested repository organization name. Default is InternLM/lmdeploy'
type: string
default: 'InternLM/lmdeploy'
repo_ref_lmdeploy:
required: false
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
regression_func:
required: true
description: 'regression functions'
type: string
default: "['chat','base','cmd']"
default: "['chat_models','base_models', 'chat_obj_fullbench', 'chat_sub_fullbench', 'base_fullbench','cmd', 'api']"
cuda_env:
required: true
description: "regression conda env, eg. ['dsw_cu11','dsw_cu12']"
type: string
default: "['dsw_cu12']"
schedule:
- cron: '56 16 * * *'
- cron: '15 16 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@ -31,7 +51,7 @@ env:
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache
HF_DATASETS_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1
@ -39,6 +59,8 @@ env:
LMDEPLOY_USE_MODELSCOPE: false
HF_HUB_OFFLINE: 1
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report
OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
jobs:
build-pypi:
@ -64,16 +86,51 @@ jobs:
retention-days: 1
name: my-artifact-${{ github.run_id }}
daily_run_test:
build-pypi-lmdeploy:
if: ${{!cancelled() && (github.event_name != 'schedule' && inputs.build_lmdeploy)}}
strategy:
matrix:
pyver: [py310]
runs-on: ubuntu-latest
env:
PYTHON_VERSION: ${{ matrix.pyver }}
PLAT_NAME: manylinux2014_x86_64
DOCKER_TAG: cuda12.1
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
repository: ${{ github.event.inputs.repo_org_lmdeploy || 'InternLM/lmdeploy' }}
ref: ${{github.event.inputs.repo_ref_lmdeploy || 'main'}}
- name: Build
run: |
echo ${PYTHON_VERSION}
echo ${PLAT_NAME}
echo ${DOCKER_TAG}
echo ${OUTPUT_FOLDER}
echo ${GITHUB_RUN_ID}
# remove -it
sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
- name: Upload Artifacts
uses: actions/upload-artifact@v4
with:
if-no-files-found: error
path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
retention-days: 1
name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}
prepare_env:
if: ${{!cancelled()}}
needs: build-pypi
needs: ['build-pypi', 'build-pypi-lmdeploy']
strategy:
fail-fast: false
matrix:
cuda_env: [dsw_cu11, dsw_cu12]
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
runs-on: ${{ matrix.cuda_env }}
environment: 'prod'
timeout-minutes: 600 #10hours
timeout-minutes: 240 #4hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@ -84,89 +141,169 @@ jobs:
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}
- name: Remove Conda Env
if: always()
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
- name: Prepare - create conda env and install torch - cu11
if: ${{matrix.cuda_env == 'dsw_cu11'}}
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
uses: nick-fields/retry@v3
id: retry1
with:
max_attempts: 3
timeout_minutes: 40
command: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu11torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - create conda env and install torch - cu12
if: ${{matrix.cuda_env == 'dsw_cu12'}}
uses: nick-fields/retry@v3
id: retry2
with:
max_attempts: 3
timeout_minutes: 40
command: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - reinstall lmdeploy - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}-py310
- name: Prepare - reinstall lmdeploy - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
pip install lmdeploy-*.whl --no-deps
daily_run_test:
if: ${{!cancelled()}}
needs: prepare_env
strategy:
fail-fast: false
matrix:
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}}
runs-on: ${{ matrix.cuda_env }}
environment: 'prod'
timeout-minutes: 240 #4hours
steps:
- name: Clone repository
uses: actions/checkout@v2
with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Prepare - prepare data and hf model
run: |
ln -s ${{env.DATEASET_CACHE_PATH}} data
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
- name: Run command testcase
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'cmd')
if: matrix.regression_func == 'cmd'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu
opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run chat model test
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'chat')
if: matrix.regression_func == 'chat_models'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py
opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass .github/scripts/eval_regression_chat.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'base')
if: matrix.regression_func == 'base_models'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass .github/scripts/eval_regression_base.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Remove Conda Env
if: always()
- name: Run chat model test - objective fullbench
if: matrix.regression_func == 'chat_obj_fullbench'
run: |
rm -rf regression_result_daily
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat_objective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat_obj_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run chat model test - subjective fullbench
if: matrix.regression_func == 'chat_sub_fullbench'
env:
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache_subset
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat_subjective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat_sub_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test - fullbench
if: matrix.regression_func == 'base_fullbench'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_base_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m base_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api
if: matrix.regression_func == 'api'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 120s
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api kill
if: always() && matrix.regression_func == 'api'
run: |
kill -15 "$restful_pid"
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}

View File

@ -10,17 +10,6 @@ on:
- 'tools/**'
workflow_dispatch:
inputs:
repo_org:
required: false
description: 'Tested repository organization name. Default is open-compass/opencompass'
type: string
default: 'open-compass/opencompass'
repo_ref:
required: false
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
schedule:
- cron: '56 22 * * *'
@ -46,9 +35,6 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v2
with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Prepare - Install opencompass
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate

View File

@ -57,6 +57,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](configs/eval_musr.py) and give it a spin! 🔥🔥🔥
- **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](configs/eval_babilong.py) and give it a try! 🔥🔥🔥
- **\[2024.10.14\]** We now support the OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU). Feel free to give it a try! 🔥🔥🔥
- **\[2024.09.19\]** We now support [Qwen2.5](https://huggingface.co/Qwen) (0.5B to 72B) with multiple backends (huggingface/vllm/lmdeploy). Feel free to give them a try! 🔥🔥🔥
- **\[2024.09.17\]** We now support OpenAI o1(`o1-mini-2024-09-12` and `o1-preview-2024-09-12`). Feel free to give them a try! 🔥🔥🔥

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .livecodebench_gen_b2b0fd import LCB_datasets # noqa: F401, F403
from .livecodebench_gen_6966bc import LCB_datasets # noqa: F401, F403

View File

@ -0,0 +1,164 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
LCBCodeGenerationDataset,
LCBCodeExecutionDataset,
LCBTestOutputPredictionDataset,
LCBCodeGenerationEvaluator,
LCBCodeExecutionEvaluator,
LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants
lcb_code_generation_reader_cfg = dict(
input_columns=[
'question_content',
'format_prompt',
],
# output_column='evaluation_sample',
output_column='question_id',
)
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
'### Answer: (use the provided format with backticks)\n\n'
# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=prompt_template
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
lcb_code_generation_eval_cfg = dict(
evaluator=dict(
type=LCBCodeGenerationEvaluator,
num_process_evaluate=4,
timeout=6,
),
pred_role='BOT',
)
LCBCodeGeneration_dataset = dict(
type=LCBCodeGenerationDataset,
abbr='lcb_code_generation',
path='opencompass/code_generation_lite',
reader_cfg=lcb_code_generation_reader_cfg,
infer_cfg=lcb_code_generation_infer_cfg,
eval_cfg=lcb_code_generation_eval_cfg
)
# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)
lcb_code_execution_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
),
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
lcb_code_execution_eval_cfg = dict(
evaluator=dict(
type=LCBCodeExecutionEvaluator,
),
pred_role='BOT',
)
LCBCodeExecution_dataset = dict(
type=LCBCodeExecutionDataset,
abbr='lcb_code_execution',
path='opencompass/execution-v2',
reader_cfg=lcb_code_execution_reader_cfg,
infer_cfg=lcb_code_execution_infer_cfg,
eval_cfg=lcb_code_execution_eval_cfg,
)
# Test Output Prediction Dataset
lcb_test_output_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
lcb_test_output_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
# begin=[
# dict(
# role='SYSTEM',
# prompt=system_prompt
# ),
# ],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
lcb_test_output_eval_cfg = dict(
evaluator=dict(
type=LCBTestOutputEvaluator,
),
pred_role='BOT',
)
LCBTestOutput_dataset = dict(
type=LCBTestOutputPredictionDataset,
abbr='lcb_test_output',
path='opencompass/test_generation',
reader_cfg=lcb_test_output_reader_cfg,
infer_cfg=lcb_test_output_infer_cfg,
eval_cfg=lcb_test_output_eval_cfg,
)
LCB_datasets = [
LCBCodeGeneration_dataset,
LCBCodeExecution_dataset,
LCBTestOutput_dataset,
]

65
configs/eval_babilong.py Normal file
View File

@ -0,0 +1,65 @@
from mmengine.config import read_base
with read_base():
# Models
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
models as lmdeploy_internlm2_5_7b_chat_model,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import (
models as lmdeploy_llama3_1_8b_instruct_model,
)
from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import (
models as lmdeploy_ministral_8b_instruct_2410_model,
)
# Datasets
from opencompass.configs.datasets.babilong.babilong_0k_gen import (
babiLong_0k_datasets,
)
from opencompass.configs.datasets.babilong.babilong_4k_gen import (
babiLong_4k_datasets,
)
from opencompass.configs.datasets.babilong.babilong_16k_gen import (
babiLong_16k_datasets,
)
from opencompass.configs.datasets.babilong.babilong_32k_gen import (
babiLong_32k_datasets,
)
from opencompass.configs.datasets.babilong.babilong_128k_gen import (
babiLong_128k_datasets,
)
from opencompass.configs.datasets.babilong.babilong_256k_gen import (
babiLong_256k_datasets,
)
from opencompass.configs.summarizers.groups.babilong import (
babilong_summary_groups,
)
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
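# The datasets above go up to 256k tokens, so give every model a 1M-token
# session window and shard it across 4 GPUs (tp=4) before running.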
for model in models:
model['engine_config']['session_len'] = 1024 * 1024
model['max_seq_len'] = 1024 * 1024
model['engine_config']['tp'] = 4
model['run_cfg']['num_gpus'] = 4
summarizer = dict(
dataset_abbrs=[
'babilong_0k',
'babilong_4k',
'babilong_16k',
'babilong_32k',
'babilong_128k',
'babilong_256k',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
),
)
work_dir = './outputs/babilong'

44
configs/eval_musr.py Normal file
View File

@ -0,0 +1,44 @@
from mmengine.config import read_base
import os.path as osp
with read_base():
from opencompass.configs.datasets.musr.musr_gen_3c6e15 import musr_datasets
# from opencompass.configs.models.hf_internlm.hf_internlm2_5_1_8b_chat import models
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
models as lmdeploy_internlm2_5_7b_chat_model,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as lmdeploy_qwen2_5_14b_instruct_model,
)
from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import (
models as lmdeploy_yi_1_5_9b_chat_model,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import (
models as lmdeploy_qwen2_5_32b_instruct_model,
)
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import (
models as lmdeploy_glm4_9b_chat_model,
)
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import (
models as lmdeploy_llama3_1_8b_instruct_model,
)
from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import (
models as lmdeploy_ministral_8b_instruct_2410_model,
)
from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import (
models as lmdeploy_gemma_9b_it_model,
)
from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import (
models as lmdeploy_gemma_27b_it_model,
)
from opencompass.configs.summarizers.groups.musr_average import summarizer
datasets = [*musr_datasets]
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
base_exp_dir = 'outputs/musr/'
work_dir = osp.join(base_exp_dir, 'musr_eval')

View File

@ -1,4 +1,4 @@
__version__ = '0.3.5'
__version__ = '0.3.6'
def _warn_about_config_migration():

View File

@ -0,0 +1,37 @@
# BABILong
OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). BABILong provides an evaluation of long-context reasoning across extremely long documents, including a diverse set of 20 reasoning tasks such as fact chaining, simple induction, deduction, counting, and handling lists/sets. This benchmark is designed to test the ability of language models to reason over facts distributed in long natural text, and it allows for the construction of tasks of almost arbitrary length to adapt to the evaluation of new, more powerful models in an extensible and controllable way.
## How to Use
The BABILong dataset is available on Hugging Face: [RMT-team/babilong](https://huggingface.co/datasets/RMT-team/babilong). OpenCompass downloads the BABILong data automatically; because of the dataset size, only splits up to 1M tokens are provided. For longer contexts, you can download the dataset from Hugging Face directly.
The BABILong paper defines 20 tasks in total; OpenCompass ships configurations for 10 of them, organized by context size. You can create your own configurations by following the example in `opencompass/configs/datasets/babilong/babilong_1m_gen.py`, as sketched below.
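For instance, a configuration for another context size can reuse the same dataset and evaluator classes. The following is only a sketch mirroring the shipped configs: the `8k` split name, sequence length, and task subset are assumptions, not an officially released file, so check the Hugging Face repository for the splits that actually exist.

```python
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

babiLong_8k_datasets = []
split_name = '8k'        # assumed split name; verify it exists in RMT-team/babilong
max_seq_len = 8 * 1024   # matches the assumed context size
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5']  # any subset of the 10 shipped tasks

for task in tasks:
    babiLong_8k_datasets.append(dict(
        abbr=f'babilong_{task}_{split_name}',
        type=BabiLongDataset,
        path='opencompass/babilong',
        task=task,
        split_name=split_name,
        reader_cfg=dict(input_columns=['prompt'], output_column='answer'),
        infer_cfg=dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(role='HUMAN', prompt='{prompt}'),
                    dict(role='BOT', prompt='{answer}\n'),
                ]),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
        ),
        eval_cfg=dict(evaluator=dict(type=BabiLongEvaluator)),
    ))
```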
OpenCompass also provides a demo config for evaluating language models on the BABILong dataset:
```bash
opencompass configs/eval_babilong.py
```
OpenCompass provides results for several models on the BABILong dataset. The evaluations are run with LMDeploy using the default model settings.
| dataset | version | metric | mode | internlm2_5-7b-chat-turbomind | qwen2.5-7b-instruct-turbomind | llama-3_1-8b-instruct-turbomind | ministral-8B-instruct-2410-turbomind |
|----- | ----- | ----- | ----- | ----- | ----- | ----- | -----|
| babilong_0k | - | naive_average | gen | 76.51 | 80.25 | 76.44 | 76.40 |
| babilong_4k | - | naive_average | gen | 67.55 | 70.35 | 67.41 | 67.92 |
| babilong_16k | - | naive_average | gen | 53.78 | 65.83 | 60.26 | 56.58 |
| babilong_32k | - | naive_average | gen | 50.86 | 62.66 | 59.56 | 53.52 |
| babilong_128k | - | naive_average | gen | 39.33 | 27.79 | 52.01 | 3.20 |
| babilong_256k | - | naive_average | gen | 17.31 | 7.30 | 23.35 | 9.50 |
## Citation
```bibtex
@misc{kuratov2024babilong,
title={BABILong: Testing the Limits of LLMs with Long Context Reasoning-in-a-Haystack},
author={Yuri Kuratov and Aydar Bulatov and Petr Anokhin and Ivan Rodkin and Dmitry Sorokin and Artyom Sorokin and Mikhail Burtsev},
year={2024},
eprint={2406.10149},
archivePrefix={arXiv}
}
```

View File

@ -0,0 +1,37 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_0k_datasets = []
split_name='0k'
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_0k_datasets.append(tmp_dataset)

View File

@ -0,0 +1,38 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_128k_datasets = []
split_name='128k'
max_seq_len = 128*1024
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_128k_datasets.append(tmp_dataset)

View File

@ -0,0 +1,38 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_16k_datasets = []
split_name='16k'
max_seq_len = 16*1024
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_16k_datasets.append(tmp_dataset)

View File

@ -0,0 +1,37 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_1m_datasets = []
split_name='1m'
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_1m_datasets.append(tmp_dataset)

View File

@ -0,0 +1,38 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_256k_datasets = []
split_name='256k'
max_seq_len = 256*1024
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_256k_datasets.append(tmp_dataset)

View File

@ -0,0 +1,38 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_2k_datasets = []
split_name='2k'
max_seq_len = 2*1024
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_2k_datasets.append(tmp_dataset)

View File

@ -0,0 +1,38 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_32k_datasets = []
split_name='32k'
max_seq_len = 32*1024
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_32k_datasets.append(tmp_dataset)

View File

@ -0,0 +1,38 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_4k_datasets = []
split_name='4k'
max_seq_len=4*1024
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_4k_datasets.append(tmp_dataset)

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .livecodebench_gen_b2b0fd import LCB_datasets # noqa: F401, F403
from .livecodebench_gen_6966bc import LCB_datasets # noqa: F401, F403

View File

@ -0,0 +1,164 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
LCBCodeGenerationDataset,
LCBCodeExecutionDataset,
LCBTestOutputPredictionDataset,
LCBCodeGenerationEvaluator,
LCBCodeExecutionEvaluator,
LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants
lcb_code_generation_reader_cfg = dict(
input_columns=[
'question_content',
'format_prompt',
],
# output_column='evaluation_sample',
output_column='question_id',
)
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
'### Answer: (use the provided format with backticks)\n\n'
# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=prompt_template
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
lcb_code_generation_eval_cfg = dict(
evaluator=dict(
type=LCBCodeGenerationEvaluator,
num_process_evaluate=4,
timeout=6,
),
pred_role='BOT',
)
LCBCodeGeneration_dataset = dict(
type=LCBCodeGenerationDataset,
abbr='lcb_code_generation',
path='opencompass/code_generation_lite',
reader_cfg=lcb_code_generation_reader_cfg,
infer_cfg=lcb_code_generation_infer_cfg,
eval_cfg=lcb_code_generation_eval_cfg
)
# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)
lcb_code_execution_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
),
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
lcb_code_execution_eval_cfg = dict(
evaluator=dict(
type=LCBCodeExecutionEvaluator,
),
pred_role='BOT',
)
LCBCodeExecution_dataset = dict(
type=LCBCodeExecutionDataset,
abbr='lcb_code_execution',
path='opencompass/execution-v2',
reader_cfg=lcb_code_execution_reader_cfg,
infer_cfg=lcb_code_execution_infer_cfg,
eval_cfg=lcb_code_execution_eval_cfg,
)
# Test Output Prediction Dataset
lcb_test_output_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
lcb_test_output_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
# begin=[
# dict(
# role='SYSTEM',
# prompt=system_prompt
# ),
# ],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
lcb_test_output_eval_cfg = dict(
evaluator=dict(
type=LCBTestOutputEvaluator,
),
pred_role='BOT',
)
LCBTestOutput_dataset = dict(
type=LCBTestOutputPredictionDataset,
abbr='lcb_test_output',
path='opencompass/test_generation',
reader_cfg=lcb_test_output_reader_cfg,
infer_cfg=lcb_test_output_infer_cfg,
eval_cfg=lcb_test_output_eval_cfg,
)
LCB_datasets = [
LCBCodeGeneration_dataset,
LCBCodeExecution_dataset,
LCBTestOutput_dataset,
]

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
)
# postprocess v2
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2),
)
math_datasets = [
dict(
type=MATHDataset,
abbr='math_prm800k_500',
path='opencompass/math',
file_name='test_prm800k_500.json',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]

View File

@ -0,0 +1,75 @@
# MuSR: Multistep Soft Reasoning Dataset
MuSR (Multistep Soft Reasoning) is a dataset designed to evaluate large language models (LLMs) on complex reasoning tasks embedded in natural-language narratives. Created to challenge state-of-the-art models such as GPT-4, MuSR emphasizes nuanced reasoning across several domains, including social and physical reasoning, commonsense reasoning, and planning, with tasks framed within realistic scenarios such as murder mysteries, object placements, and team allocations.
## Overview
### Purpose
Current large language models can perform complex tasks through prompting techniques like chain-of-thought reasoning. However, robust multistep reasoning remains challenging. MuSR addresses these limitations by evaluating LLM performance on tasks involving multistep reasoning in three domains:
- **Murder Mysteries**: Requires social and physical deductive reasoning.
- **Object Placements**: Tests observational and theory-of-mind reasoning.
- **Team Allocations**: Focuses on social reasoning and constraint satisfaction.
### Dataset Construction
MuSR instances are generated using a neurosymbolic synthetic-to-natural narrative generation algorithm. This approach allows for the creation of complex reasoning instances that combine structured reasoning trees with natural language narratives, challenging both direct and nuanced inference capabilities in LLMs.
MuSR's dataset consists of:
- **Murder Mysteries**: Scenarios with suspects, motives, and opportunities requiring deductive inference.
- **Object Placements**: Scenarios where individuals' observations inform reasoning about object locations.
- **Team Allocations**: Scenarios that simulate social relationships and teamwork for optimal task assignments.
### Dataset Access
The MuSR dataset is publicly available, with instructions provided in the [GitHub project](https://github.com/Zayne-Sprague/MuSR). You can download the dataset and use the pre-defined prompts or create your own configurations.
### Evaluation
1. Install dependencies and configure the environment.
2. Run evaluations using `opencompass configs/eval_musr.py` to assess LLM performance.
3. Analyze results against human performance benchmarks.
### Example Command
```bash
opencompass configs/eval_musr.py
```
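If you only want a quick smoke test on one of the domains, a minimal config along the following lines should work. This is a sketch, not part of the release: the internlm2.5-7b-chat model choice, the filter on a single subset, the file name, and the work directory are all arbitrary choices for illustration.

```python
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.musr.musr_gen_3c6e15 import musr_datasets
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
        models as lmdeploy_internlm2_5_7b_chat_model,
    )

# keep only the murder-mysteries split for a quick run
datasets = [d for d in musr_datasets if d['abbr'] == 'musr_murder_mysteries']
models = [*lmdeploy_internlm2_5_7b_chat_model]
work_dir = './outputs/musr_smoke_test'
```

Saved as, say, `configs/eval_musr_smoke.py`, it can be launched with `opencompass configs/eval_musr_smoke.py`.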
## Baselines and Results
MuSR includes baseline results for multiple LLMs evaluated with chain-of-thought and advanced reasoning strategies. These benchmarks assess model accuracy on reasoning tasks across the three domains.
| Domain | Baseline Accuracy (GPT-4) | Human Performance |
|------------------|---------------------------|--------------------|
| Murder Mystery | 80.4% | 94.1% |
| Object Placement | 60.9% | 95.0% |
| Team Allocation | 68.4% | 100% |
| dataset | version | metric | mode | internlm2_5-7b-chat-turbomind | qwen2.5-7b-instruct-turbomind | qwen2.5-14b-instruct-turbomind | yi-1.5-9b-chat-turbomind | qwen2.5-32b-instruct-turbomind | glm-4-9b-chat-turbomind | llama-3_1-8b-instruct-turbomind | ministral-8B-instruct-2410-turbomind | gemma-2-9b-it-turbomind | gemma-2-27b-it-turbomind |
|----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | -----|
| musr_murder_mysteries | a5ce30 | accuracy | gen | 59.20 | 63.20 | 76.00 | 68.80 | 78.80 | 71.20 | 73.60 | 73.60 | 74.80 | 77.20 |
| musr_object_placements | a5ce30 | accuracy | gen | 54.69 | 56.25 | 57.42 | 52.73 | 66.02 | 49.22 | 57.42 | 60.94 | 60.94 | 62.11 |
| musr_team_allocation | a5ce30 | accuracy | gen | 39.20 | 32.40 | 55.60 | 40.00 | 67.60 | 50.40 | 46.00 | 36.40 | 40.80 | 41.20 |
| musr_average | - | naive_average | gen | 51.03 | 50.62 | 63.01 | 53.84 | 70.81 | 56.94 | 59.01 | 56.98 | 58.85 | 60.17 |
## Citation
If you use MuSR in your research, please cite:
```bibtex
@misc{sprague2024musrtestinglimitschainofthought,
title={MuSR: Testing the Limits of Chain-of-thought with Multistep Soft Reasoning},
author={Zayne Sprague and Xi Ye and Kaj Bostrom and Swarat Chaudhuri and Greg Durrett},
year={2024},
eprint={2310.16049},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2310.16049},
}
```
## Details
For further details, please refer to the MuSR paper [here](https://arxiv.org/abs/2310.16049).

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .musr_gen_3c6e15 import musr_datasets # noqa: F401, F403

View File

@ -0,0 +1,135 @@
from opencompass.datasets import MusrDataset, MusrEvaluator
from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer
DATASET_CONFIGS = {
'murder_mysteries': {
'abbr': 'musr_murder_mysteries',
'name': 'murder_mysteries',
'path': 'opencompass/musr',
'reader_cfg': dict(
input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
output_column='gold_answer',
),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}'
)
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
),
'eval_cfg': dict(
evaluator=dict(
type=MusrEvaluator,
answer_index_modifier=1,
self_consistency_n=1
),
),
},
'object_placements': {
'abbr': 'musr_object_placements',
'name': 'object_placements',
'path': 'opencompass/musr',
'reader_cfg': dict(
input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
output_column='gold_answer',
),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}'
)
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
),
'eval_cfg': dict(
evaluator=dict(
type=MusrEvaluator,
answer_index_modifier=1,
self_consistency_n=1
),
),
},
'team_allocation': {
'abbr': 'musr_team_allocation',
'name': 'team_allocation',
'path': 'opencompass/musr',
'reader_cfg': dict(
input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
output_column='gold_answer',
),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}'
)
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
),
'eval_cfg': dict(
evaluator=dict(
type=MusrEvaluator,
answer_index_modifier=1,
self_consistency_n=1
),
),
},
}
musr_datasets = []
for config in DATASET_CONFIGS.values():
dataset = dict(
abbr=config['abbr'],
type=MusrDataset,
path=config['path'],
name=config['name'],
reader_cfg=config['reader_cfg'],
infer_cfg=config['infer_cfg'],
eval_cfg=config['eval_cfg'],
)
musr_datasets.append(dataset)

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='glm-4-9b-turbomind',
path='THUDM/glm-4-9b',
engine_config=dict(max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
max_seq_len=8192,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='qwen2.5-14b-turbomind',
path='Qwen/Qwen2.5-14B',
engine_config=dict(session_len=7168, max_batch_size=16, tp=2),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
max_seq_len=7168,
max_out_len=1024,
batch_size=16,
run_cfg=dict(num_gpus=2),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='qwen2.5-32b-turbomind',
path='Qwen/Qwen2.5-32B',
engine_config=dict(session_len=7168, max_batch_size=16, tp=2),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
max_seq_len=7168,
max_out_len=1024,
batch_size=16,
run_cfg=dict(num_gpus=2),
)
]

View File

@ -0,0 +1,17 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='qwen2.5-72b-turbomind',
path='Qwen/Qwen2.5-72B',
engine_config=dict(session_len=7168, max_batch_size=16, tp=4),
gen_config=dict(
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024
),
max_seq_len=7168,
max_out_len=1024,
batch_size=16,
run_cfg=dict(num_gpus=4),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='yi-1.5-9b-turbomind',
path='01-ai/Yi-1.5-9B',
engine_config=dict(session_len=4096, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
max_seq_len=4096,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,37 @@
default_babilong_tasks = [
'qa1',
'qa2',
'qa3',
'qa4',
'qa5',
'qa6',
'qa7',
'qa8',
'qa9',
'qa10',
]
context_window_sizes = [
'0k',
'1k',
'2k',
'4k',
'8k',
'16k',
'32k',
'64k',
'128k',
'256k',
'512k',
'1m',
]
babilong_summary_groups = []
for context_window_size in context_window_sizes:
babilong_summary_groups.append(
{
'name': f'babilong_{context_window_size}',
'subsets': [
f'babilong_{task}_{context_window_size}'
for task in default_babilong_tasks
],
}
)

View File

@ -0,0 +1,19 @@
summarizer = dict(
dataset_abbrs=[
'musr_murder_mysteries',
'musr_object_placements',
'musr_team_allocation',
'musr_average'
],
summary_groups=[
{
'name': 'musr_average',
'subsets': [
'musr_murder_mysteries',
'musr_object_placements',
'musr_team_allocation',
],
}
],
)

View File

@ -7,6 +7,7 @@ from .anthropics_evals import * # noqa: F401, F403
from .apps import * # noqa: F401, F403
from .arc import * # noqa: F401, F403
from .ax import * # noqa: F401, F403
from .babilong import * # noqa: F401, F403
from .bbh import * # noqa: F401, F403
from .boolq import * # noqa: F401, F403
from .bustum import * # noqa: F401, F403
@ -86,6 +87,7 @@ from .mmlu_pro import * # noqa: F401, F403
from .MMLUArabic import * # noqa: F401, F403
from .mmmlu import * # noqa: F401, F403
from .multirc import * # noqa: F401, F403
from .musr import * # noqa: F401, F403
from .narrativeqa import * # noqa: F401, F403
from .natural_question import * # noqa: F401, F403
from .natural_question_cn import * # noqa: F401, F403

View File

@ -0,0 +1 @@
from .babilong import * # noqa: F401, F403

View File

@ -0,0 +1,106 @@
# flake8: noqa: F401, E501
import json
import os
from datasets import Dataset
from opencompass.datasets.babilong.babilong_utils import compare_answers
from opencompass.datasets.babilong.prompts import (DEFAULT_PROMPTS,
DEFAULT_TEMPLATE,
get_formatted_input)
from opencompass.datasets.base import BaseDataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path
@LOAD_DATASET.register_module()
class BabiLongDataset(BaseDataset):
@staticmethod
def load(
path,
task,
split_name,
use_instruction=True,
use_examples=True,
use_post_prompt=True,
) -> Dataset:
assert task in [
'qa1',
'qa2',
'qa3',
'qa4',
'qa5',
'qa6',
'qa7',
'qa8',
'qa9',
'qa10',
], f"Task must be in ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']"
assert split_name in [
'0k',
'1k',
'2k',
'4k',
'8k',
'16k',
'32k',
'64k',
'128k',
'256k',
'512k',
'1m',
], f"Split name must be in ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k', '128k', '256k', '512k', '1m']"
# configure the prompt
prompt_cfg = {
'instruction':
(DEFAULT_PROMPTS[task]['instruction'] if use_instruction else ''),
'examples':
(DEFAULT_PROMPTS[task]['examples'] if use_examples else ''),
'post_prompt':
(DEFAULT_PROMPTS[task]['post_prompt'] if use_post_prompt else ''),
'template':
DEFAULT_TEMPLATE,
}
path = get_data_path(path)
file = os.path.join(path, task, f'{split_name}.json')
with open(file, 'r') as f:
task_data = json.load(f)
data = []
for sample in task_data:
tmp_data = {'prompt': [], 'answer': []}
target = sample['target']
context = sample['input']
question = sample['question']
input_text = get_formatted_input(
context,
question,
prompt_cfg['examples'],
prompt_cfg['instruction'],
prompt_cfg['post_prompt'],
template=DEFAULT_TEMPLATE,
)
tmp_data['prompt'].append(input_text)
tmp_data['answer'].append(target)
data.append(tmp_data)
return Dataset.from_list(data)
class BabiLongEvaluator(BaseEvaluator):
def score(self, predictions, gold):
assert len(predictions) == len(gold)
score = (sum([
compare_answers(str(ref[0]), pred)
for pred, ref in zip(predictions, gold)
]) / len(predictions) * 100)
result = {'score': round(score, 2)}
return result

View File

@ -0,0 +1,293 @@
# flake8: noqa: E501
# Modified from https://github.com/booydar/babilong/blob/main/babilong/babilong_utils.py
import re
import nltk
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
def compare_answers(target, output):
"""Compare target and output answers.
Takes only the first sentence from output and filters responses when model
tries to generate examples. We consider prediction correct if target is in
output.
"""
target = target.lower()
output = output.lower()
# take only the first sentence from output
output = output.split('.')[0]
# filter responses when model tries to generate examples
output = output.split('<context>')[0]
output = output.split('<example>')[0]
# we consider prediction correct if target is in output
if target in output:
return True
return False
def get_dataset_df(dataset_path, max_n_facts=None):
"""Preprocess babi text files."""
with open(dataset_path, 'r') as f:
texts = f.read().strip()
texts = texts.split('\n')
df = pd.DataFrame(texts, columns=['text'])
# parse samples
df['phrase_num'] = df.text.apply(lambda x: int(x.split(' ')[0]))
df.text = df.text.apply(lambda x: x[x.index(' ') + 1:])
df['answer'] = df.text.apply(lambda x: x[x.index('\t') + 1:]
if '\t' in x else None)
df['reference_num'] = df.answer.apply(
lambda x: x
if x is None else [int(n) for n in re.split('\t| ', x)[1:]])
df.answer = df.answer.apply(lambda x: x if x is None else x.split('\t')[0])
df.text = df.text.apply(lambda x: x.split('\t')[0] if '\t' in x else x)
# mark each sample
sample_start_inds = list(np.where(df.phrase_num == 1)[0]) + [df.shape[0]]
for i, (start,
end) in enumerate(zip(sample_start_inds, sample_start_inds[1:])):
df.loc[start:end, 'initial_sample_num'] = i
df.initial_sample_num = df.initial_sample_num.astype(int)
# multiple questions in sample -> samples with single question
initial_samples = [
df[df.initial_sample_num == sn]
for sn in df.initial_sample_num.unique()
]
single_question_slices = []
for sample in initial_samples:
answer_positions = sample[~sample.answer.isna()].index
slices = [sample.loc[:ans_pos].copy() for ans_pos in answer_positions]
for i, slc in enumerate(slices):
slices[i] = slc[(slc.answer.isna()) | (slc.index == slc.index[-1])]
if max_n_facts is not None: # drop samples with too many facts
slices = [slc for slc in slices if slc.shape[0] <= max_n_facts]
single_question_slices += slices
df = pd.concat(single_question_slices).reset_index(drop=True)
# mark each sample again
sample_start_inds = list(np.where(df.phrase_num == 1)[0]) + [df.shape[0]]
for i, (start,
end) in enumerate(zip(sample_start_inds, sample_start_inds[1:])):
df.loc[start:end, 'sample_num'] = i
df.sample_num = df.sample_num.astype(int)
return df
class TaskDataset(Dataset):
"""Babi task loader dataset."""
def __init__(self, dataset_path, max_n_facts=None):
self.fact_dataset = get_dataset_df(dataset_path,
max_n_facts=max_n_facts)
def __getitem__(self, ind):
slc = self.fact_dataset[self.fact_dataset.sample_num == ind]
references = slc[slc.phrase_num.isin(
slc.reference_num.values[-1])].text.values
sample = {
'facts': slc.text.values[:-1],
'question': slc.text.values[-1],
'answer': slc.answer.values[-1],
'references': references,
}
return sample
def __len__(self):
return self.fact_dataset.sample_num.max()
def sum_lengths(sentences):
return sum([len(s) for s in sentences])
class SentenceSampler:
"""Sampler of background text."""
def __init__(
self,
dataset,
tokenizer,
min_sentence_len=10,
max_sentence_len=None,
shuffle=False,
random_seed=42,
):
self.sample_ind = 0
self.dataset = dataset
self.sentences = []
self.tokenizer = tokenizer
self.min_sentence_len = min_sentence_len
self.max_sentence_len = max_sentence_len
self.sentence_tokenizer = nltk.PunktSentenceTokenizer()
self.shuffle = shuffle
self.gen = np.random.default_rng(seed=random_seed)
def get_sample(self, sample_size):
sample = []
total_len = 0
while True:
sentences = list(self.sentences)
for i, sent in enumerate(
sentences
): # add new sentence until sample_size is reached
tokenized = self.tokenizer.encode(sent,
add_special_tokens=False)
if not self.length_is_ok(tokenized):
continue
total_len += len(tokenized)
sample.append(tokenized)
if total_len >= sample_size:
self.sentences = self.sentences[i + 1:]
cutoff = total_len - sample_size
if cutoff > 0:
sample[-1] = sample[-1][:-cutoff]
return sample
self.sentences = []
self.sample_sentences_(
sample_size
) # appends new sentences, can be updated to just return new sentences
def sample_sentences_(self, sample_size):
sentences = []
while len(sentences) == 0:
text = self.next_sample_()
if self.shuffle:
if len(text) == 0:
continue
text = text[self.gen.choice(len(
text)):] # start from random position in text
text = text[:sample_size *
10] # cut too long texts to speed up tokenization
sentences += self.sentence_tokenizer.tokenize(text)
if self.shuffle:
sentences = sentences[1:-1]
self.sentences += sentences
def next_sample_(self):
if self.shuffle:
self.total_tokens = 0
sample_ind = self.gen.choice(len(self.dataset))
sample = self.dataset[int(sample_ind)]['text']
else:
sample = self.dataset[int(self.sample_ind)]['text']
self.sample_ind += 1
self.sample_ind = self.sample_ind % len(self.dataset)
return sample
def length_is_ok(self, tokenized):
if (self.max_sentence_len is not None
and len(tokenized) > self.max_sentence_len):
return False
if (self.min_sentence_len is not None
and len(tokenized) < self.min_sentence_len):
return False
return True
class NoiseInjectionDataset(Dataset):
"""Combined dataset for noisy babi QA.
It's recommended to use sample_size >= 1024 and task_end_pct - task_start_pct >= 0.2
"""
def __init__(
self,
task_dataset,
noise_sampler,
tokenizer,
task_start_pct=None, # left border of facts in sample, between 0 and 1
task_end_pct=None, # right border of facts in sample, between task_start_pct and 1
sample_size=1024,
mixed_length_ratio=0.0, # used for mixed length curriculum, prob for shorter samples
random_seed=42,
):
self.task_dataset = task_dataset
self.noise_sampler = noise_sampler
self.sample_size = sample_size
self.mixed_length_ratio = mixed_length_ratio
self.tokenizer = tokenizer
self.task_start_pct = task_start_pct
self.task_end_pct = task_end_pct
if random_seed:
self.gen = np.random.default_rng(seed=random_seed)
def __getitem__(self, ind):
sample = self.task_dataset[ind]
facts_tok = self.tokenizer(list(sample['facts']))['input_ids']
question_tok = self.tokenizer(sample['question'])['input_ids']
answer_tok = self.tokenizer(sample['answer'])['input_ids']
sample_size = self.get_sample_size()
task_len = sum_lengths(facts_tok)
background_text_len = sample_size - task_len
background_text = self.noise_sampler.get_sample(background_text_len)
sample['background_text'] = background_text
if (self.task_start_pct is None
and self.task_end_pct is None): # if fact position unspecified
possible_positions = range(len(background_text) + 1)
else:
task_start_ind = int(sample_size * self.task_start_pct)
task_end_ind = int(sample_size * self.task_end_pct)
total_facts_len = sum_lengths(facts_tok)
possible_positions = [] # where can we insert facts?
current_length = 0
for i, text in enumerate(background_text):
if (current_length >= task_start_ind) and (
current_length < task_end_ind - total_facts_len):
possible_positions.append(i)
current_length += len(text)
if len(possible_positions) == 0:
raise IndexError(
f'Unable to insert facts in specified place: {self.task_start_pct, self.task_end_pct}.'
f'Total fact length: {total_facts_len}, '
f'sentences length: {[len(t) for t in background_text]}. '
f'Make the range wider or increase the sample size.')
fact_positions = self.gen.choice(possible_positions, len(facts_tok))
fact_positions.sort()
sample['fact_positions'] = (
fact_positions # positions of facts between noise sentences
)
updated_sample = [[] for _ in range(len(background_text) + 1)]
for fact, pos in zip(facts_tok, fact_positions):
updated_sample[pos].append(fact)
for i, s in enumerate(background_text):
updated_sample[i].append(s)
flat = [i for s in updated_sample for i in s]
tokens = [i for s in flat for i in s]
sample['input_tokens'] = tokens
sample['question_tokens'] = question_tok
sample['target_tokens'] = answer_tok
return sample
def __len__(self):
return len(self.task_dataset)
def get_sample_size(self):
if isinstance(self.sample_size, list):
if self.gen.random() > self.mixed_length_ratio:
return self.gen.choice(self.sample_size)
return max(self.sample_size)
else:
return self.sample_size

View File

@ -0,0 +1,516 @@
# flake8: noqa: E501
SYSTEM_TEMPLATE = '{instruction}\n\n{examples}\n\n{post_prompt}'
USER_TEMPLATE = '<context>\n{context}\n</context>\n\nQuestion: {question}'
DEFAULT_TEMPLATE = f'{SYSTEM_TEMPLATE}\n\n{USER_TEMPLATE}'
CUSTOM_SYSTEM_PROMPTS = {
# https://github.com/dvlab-research/LongLoRA/blob/2345c6d030f61ac3a031906386a103a5b05e0e6f/inference.py#L18
'LONGLORA_LLAMA2':
'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. '
'Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. '
'Please ensure that your responses are socially unbiased and positive in nature.\n\n'
'If a question does not make any sense, or is not factually coherent, explain why instead of answering '
'something not correct. If you don\'t know the answer to a question, please don\'t share false information.'
}
def get_formatted_input(
context,
question,
examples,
instruction,
post_prompt,
template=DEFAULT_TEMPLATE,
):
# pre_prompt - general instruction
# examples - in-context examples
# post_prompt - any additional instructions after examples
# context - text to use for qa
# question - question to answer based on context
formatted_input = template.format(
instruction=instruction,
examples=examples,
post_prompt=post_prompt,
context=context.strip(),
question=question,
)
return formatted_input.strip()
DEFAULT_PROMPTS = {
'qa1': {
'instruction':
'I will give you context with the facts about positions of different persons hidden in some random text '
'and a question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location to answer the question.',
'examples':
'<example>\n'
'Charlie went to the hallway. Judith come back to the kitchen. Charlie travelled to balcony. '
'Where is Charlie?\n'
'Answer: The most recent location of Charlie is balcony.\n'
'</example>\n\n'
'<example>\n'
'Alan moved to the garage. Charlie went to the beach. Alan went to the shop. Rouse '
'travelled to balcony. Where is Alan?\n'
'Answer: The most recent location of Alan is shop.\n'
'</example>',
'post_prompt':
'Always return your answer in the following format: '
'The most recent location of person is location. Do not write anything else after that.',
},
'qa2': {
'instruction':
'I give you context with the facts about locations and actions of different persons '
'hidden in some random text and a question.'
'You need to answer the question based only on the information from the facts.\n'
'If a person got an item in the first location and travelled to the second location '
'the item is also in the second location. '
'If a person dropped an item in the first location and moved to the second location '
'the item remains in the first location.',
'examples':
'<example>\n'
'Charlie went to the kitchen. Charlie got a bottle. Charlie moved to the balcony. '
'Where is the bottle?\n'
'Answer: The bottle is in the balcony.\n'
'</example>\n'
'<example>\n'
'Alan moved to the garage. Alan got a screw driver. Alan moved to the kitchen. Where '
'is the screw driver?\n'
'Answer: The screw driver is in the kitchen.\n'
'</example>',
'post_prompt':
'Always return your answer in the following format: The item is in location. '
'Do not write anything else after that.',
},
'qa3': {
'instruction':
'I give you context with the facts about locations and actions of different persons '
'hidden in some random text and a question. '
'You need to answer the question based only on the information from the facts.\n'
'If a person got an item in the first location and travelled to the second location '
'the item is also in the second location. '
'If a person dropped an item in the first location and moved to the second location '
'the item remains in the first location.',
'examples':
'<example>\n'
'John journeyed to the bedroom. Mary grabbed the apple. Mary went back to the bathroom. '
'Daniel journeyed to the bedroom. Daniel moved to the garden. Mary travelled to the kitchen. '
'Where was the apple before the kitchen?\n'
'Answer: Before the kitchen the apple was in the bathroom.\n'
'</example>\n'
'<example>\n'
'John went back to the bedroom. John went back to the garden. John went back to the kitchen. '
'Sandra took the football. Sandra travelled to the garden. Sandra journeyed to the bedroom. '
'Where was the football before the bedroom?\n'
'Answer: Before the bedroom the football was in the garden.\n'
'</example>',
'post_prompt':
'Always return your answer in the following format: '
'Before the $location_1$ the $item$ was in the $location_2$. Do not write anything else after that.',
},
'qa4': {
'instruction':
'I will give you context with the facts about different people, their location and actions, hidden in '
'some random text and a question. '
'You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'The hallway is south of the kitchen. The bedroom is north of the kitchen. '
'What is the kitchen south of?\n'
'Answer: bedroom\n'
'</example>\n'
'<example>\n'
'The garden is west of the bedroom. The bedroom is west of the kitchen. What is west of the bedroom?\n'
'Answer: garden\n'
'</example>',
'post_prompt':
'Your answer should contain only one word - location. Do not write anything else after that.',
},
'qa5': {
'instruction':
'I will give you context with the facts about locations and their relations hidden in some random text '
'and a question. You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'Mary picked up the apple there. Mary gave the apple to Fred. Mary moved to the bedroom. '
'Bill took the milk there. Who did Mary give the apple to?\n'
'Answer: Fred\n'
'</example>\n'
'<example>\n'
'Jeff took the football there. Jeff passed the football to Fred. Jeff got the milk there. '
'Bill travelled to the bedroom. Who gave the football?\n'
'Answer: Jeff\n'
'</example>\n'
'<example>\n'
'Fred picked up the apple there. Fred handed the apple to Bill. Bill journeyed to the bedroom. '
'Jeff went back to the garden. What did Fred give to Bill?\n'
'Answer: apple\n'
'</example>',
'post_prompt':
'Your answer should contain only one word. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa6': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'John travelled to the hallway. John travelled to the garden. Is John in the garden?\n'
'Answer: yes\n'
'</example>\n'
'<example>\n'
'Mary went to the office. Daniel journeyed to the hallway. Mary went to the bedroom. '
'Sandra went to the garden. Is Mary in the office?\n'
'Answer: no\n'
'</example>\n',
'post_prompt':
'Your answer should contain only one word - $yes$ or $no$. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa7': {
'instruction':
'I will give you context with the facts about people and objects they carry, hidden in some random text '
'and a question. You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'Daniel went to the bedroom. Daniel got the apple there. How many objects is Daniel carrying?\n'
'Answer: one\n'
'</example>\n'
'<example>\n'
'Mary grabbed the apple there. Mary gave the apple to John. How many objects is Mary carrying?\n'
'Answer: none\n'
'</example>\n'
'<example>\n'
'Sandra travelled to the hallway. Sandra picked up the milk there. Sandra took the apple there. '
'Mary travelled to the garden. How many objects is Sandra carrying?\n'
'Answer: two\n'
'</example>\n',
'post_prompt':
'Your answer should contain only one word - $none$ or $number_of_objects$. '
'Do not write anything else after that. Do not explain your answer.',
},
'qa8': {
'instruction':
'I will give you context with the facts about people and objects they carry, hidden in some random text '
'and a question. You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'Sandra travelled to the garden. Mary grabbed the milk there. What is Mary carrying?\n'
'Answer: milk\n'
'</example>\n'
'<example>\n'
'Mary travelled to the kitchen. Sandra travelled to the office. John travelled to the office. '
'Sandra discarded the milk there. What is Sandra carrying?\n'
'Answer: nothing\n'
'</example>\n'
'<example>\n'
'Daniel grabbed the apple there. Mary went to the office. Daniel moved to the garden. '
'Daniel grabbed the milk there. Mary went to the kitchen. What is Daniel carrying?\n'
'Answer: apple,milk\n'
'</example>\n',
'post_prompt':
'Your answer should contain only one or two words: $nothing$ or $object$ or $object_1$, $object_2$. '
'Do not write anything else. Do not explain your answer.',
},
'qa9': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and '
'a question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'John is not in the bathroom. Sandra is not in the bedroom. Is John in the bathroom?\n'
'Answer: no\n'
'</example>\n'
'<example>\n'
'Mary journeyed to the kitchen. John is in the bedroom. Sandra is not in the garden. '
'Is Mary in the kitchen?\n'
'Answer: yes\n'
'</example>\n',
'post_prompt':
'Your answer should contain only one word - $yes$ or $no$. Do not write anything else. '
'Do not explain your answer.',
},
'qa10': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'Bill is in the kitchen. Julie is either in the school or the cinema. Is Bill in the bedroom?\n'
'Answer: no\n'
'</example>\n'
'<example>\n'
'Fred is in the bedroom. Mary is either in the school or the cinema. Is Mary in the school?\n'
'Answer: maybe\n'
'</example>\n'
'<example>\n'
'Fred is either in the kitchen or the park. Bill moved to the cinema. Is Bill in the cinema?\n'
'Answer: yes\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - $yes$ or $no$ or $maybe$. Do not write anything else. '
'Do not explain your answer.',
},
'qa11': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'Daniel journeyed to the hallway. After that he journeyed to the garden. Where is Daniel?\n'
'Answer: garden\n'
'</example>\n'
'<example>\n'
'Mary moved to the office. Afterwards she journeyed to the kitchen. Daniel went to the hallway. '
'Then he journeyed to the garden. Where is Mary?\n'
'Answer: kitchen\n'
'</example>\n'
'<example>\n'
'Sandra moved to the kitchen. After that she went back to the hallway. Sandra moved to the bedroom. '
'Then she went to the hallway. Mary moved to the bedroom. Afterwards she travelled to the bathroom. '
'Where is Sandra?\n'
'Answer: hallway\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - location. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa12': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'Mary and Daniel travelled to the bathroom. John and Daniel travelled to the office. Where is Daniel?\n'
'Answer: office\n'
'</example>\n'
'<example>\n'
'Sandra and Mary went back to the office. Daniel and Sandra went to the bedroom. Sandra and Mary travelled to the hallway. '
'John and Mary went to the kitchen. Where is Mary?\n'
'Answer: kitchen\n'
'</example>\n'
'<example>\n'
'Daniel and Sandra went back to the hallway. Daniel and John moved to the office. Daniel and John moved to the garden. '
'Daniel and Mary went back to the bathroom. Daniel and John went back to the kitchen. Daniel and Sandra went to the bathroom. '
'Where is John?\n'
'Answer: kitchen\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - location. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa13': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'Mary and Daniel travelled to the bathroom. Then they journeyed to the hallway. Where is Daniel?\n'
'Answer: hallway\n'
'</example>\n'
'<example>\n'
'Daniel and Sandra travelled to the kitchen. After that they journeyed to the hallway. Mary and Daniel travelled to the bedroom. '
'After that they travelled to the hallway. Where is Sandra?\n'
'Answer: hallway\n'
'</example>\n'
'<example>\n'
'John and Mary moved to the bathroom. Then they travelled to the office. John and Mary went to the kitchen. '
'Afterwards they went to the bedroom. John and Sandra moved to the bathroom. Following that they went back to the kitchen. '
'Where is Mary?\n'
'Answer: bedroom\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - location. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa14': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'Bill went back to the cinema yesterday. Julie went to the school this morning. Fred went to the park yesterday. '
'Yesterday Julie went to the office. Where was Julie before the school?\n'
'Answer: office\n'
'</example>\n'
'<example>\n'
'This morning Fred went to the kitchen. Fred journeyed to the bedroom yesterday. Mary travelled to the bedroom this morning. '
'Yesterday Mary went to the cinema. Where was Mary before the bedroom?\n'
'Answer: cinema\n'
'</example>\n'
'<example>\n'
'Yesterday Julie went back to the park. Julie went to the bedroom this morning. Bill journeyed to the cinema yesterday. '
'This morning Bill went back to the park. This evening Julie went to the school. This afternoon Julie went back to the park. '
'Where was Julie before the bedroom?\n'
'Answer: park\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - location. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa15': {
'instruction':
'I will give you context with the facts about animals, their names and relations. The facts and a question '
'are hidden in some random text. You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'Mice are afraid of wolves. Gertrude is a mouse. Cats are afraid of sheep. '
'Winona is a mouse. Sheep are afraid of wolves. Emily is a mouse. Jessica is a wolf. '
'What is gertrude afraid of?\n'
'Answer: wolf\n'
'</example>\n'
'<example>\n'
'Mice are afraid of wolves. Gertrude is a mouse. Cats are afraid of sheep. '
'Winona is a mouse. Sheep are afraid of wolves. Emily is a mouse. Jessica is a wolf. '
'What is jessica afraid of?\n'
'Answer: cat\n'
'</example>\n'
'<example>\n'
'Mice are afraid of cats. Wolves are afraid of sheep. Emily is a wolf. '
'Cats are afraid of sheep. Gertrude is a wolf. Sheep are afraid of cats. Winona is a wolf. '
'What is emily afraid of?\n'
'Answer: sheep\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - an animal species. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa16': {
'instruction':
'I will give you context with the facts about animals, their names and colors. The facts and a question '
'are hidden in some random text. You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'Lily is a frog. Bernhard is a frog. Bernhard is green. Brian is a lion. Brian is white. '
'Julius is a swan. Julius is green. Lily is green. Greg is a swan. What color is Greg?\n'
'Answer: green\n'
'</example>\n'
'<example>\n'
'Julius is a lion. Lily is a rhino. Bernhard is a swan. Lily is white. Bernhard is green. '
'Greg is a rhino. Greg is gray. Julius is white. Brian is a lion. What color is Brian?\n'
'Answer: white\n'
'</example>\n'
'<example>\n'
'Brian is a rhino. Julius is a lion. Bernhard is a lion. Greg is a swan. Brian is gray. '
'Greg is white. Lily is a rhino. Bernhard is yellow. Lily is gray. What color is Julius?\n'
'Answer: yellow\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - a color. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa17': {
'instruction':
'I will give you context with the facts about different figures, their location and colors, hidden in '
'some random text and a question. '
'You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'The triangle is above the pink rectangle. The blue square is to the left of the triangle. '
'Is the pink rectangle to the right of the blue square?\n'
'Answer: yes\n'
'</example>\n'
'<example>\n'
'The red sphere is to the left of the yellow square. The red sphere is below the pink rectangle. '
'Is the pink rectangle to the left of the yellow square?\n'
'Answer: yes\n'
'</example>'
'<example>\n'
'The red sphere is above the pink rectangle. The red sphere is to the right of the red square. '
'Is the pink rectangle above the red square?\n'
'Answer: no\n'
'</example>',
'post_prompt':
'Your answer should contain only one word - $yes$ or $no$. Do not write anything else. '
'Do not explain your answer.',
},
'qa18': {
'instruction':
'I will give you context with the facts about different objects and their sizes, hidden in '
'some random text and a question. '
'You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'The box of chocolates fits inside the chest. The box is bigger than the chest. The box is bigger than the suitcase. '
'The suitcase fits inside the box. The container is bigger than the box of chocolates. Does the box fit in the box of chocolates?\n'
'Answer: no\n'
'</example>\n'
'<example>\n'
'The suitcase is bigger than the container. The container fits inside the box. The chest is bigger than the chocolate.'
'The suitcase fits inside the box. The chest fits inside the box. Does the chocolate fit in the box?\n'
'Answer: yes\n'
'</example>'
'<example>\n'
'The chocolate fits inside the box of chocolates. The suitcase fits inside the box. The chocolate fits inside the box. '
'The box is bigger than the box of chocolates. The suitcase is bigger than the box of chocolates. Is the chocolate bigger than the box?\n'
'Answer: no\n'
'</example>',
'post_prompt':
'Your answer should contain only one word - $yes$ or $no$. Do not write anything else. '
'Do not explain your answer.',
},
'qa19': {
'instruction':
'I will give you context with the facts about different places and their locations, hidden in '
'some random text and a question. '
'You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'The office is east of the hallway. The kitchen is north of the office. The garden is west of the bedroom. '
'The office is west of the garden. The bathroom is north of the garden. How do you go from the kitchen to the garden?\n'
'Answer: s,e\n'
'</example>\n'
'<example>\n'
'The bedroom is west of the hallway. The office is east of the garden. The garden is north of the kitchen. '
'The kitchen is north of the bathroom. The hallway is west of the garden. How do you go from the kitchen to the hallway?\n'
'Answer: n,w\n'
'</example>\n'
'<example>\n'
'The bedroom is south of the hallway. The bathroom is east of the office. The kitchen is west of the garden. '
'The garden is south of the office. The office is south of the bedroom. How do you go from the garden to the bedroom?\n'
'Answer: n,n\n'
'</example>\n',
'post_prompt':
'Your answer should contain only two letters, separated by a comma - ordinal directions. You can choose the letters from '
'$n$, $s$, $e$ and $w$. Do not write anything else after that.',
},
'qa20': {
'instruction':
'I will give you context with the facts about people, their locations and condition hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'Sumit is tired. Where will sumit go?\n'
'Answer: bedroom\n'
'</example>\n'
'<example>\n'
'Yann is hungry. Yann journeyed to the kitchen. Why did yann go to the kitchen?\n'
'Answer: hungry\n'
'</example>\n'
'<example>\n'
'Antoine is thirsty. Yann is tired. Yann went back to the bedroom. Yann picked up the pajamas there.'
'Jason is thirsty. Antoine went back to the kitchen. Why did antoine go to the kitchen?\n'
'Answer: thirsty\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - a person condition or a place. Do not write anything else after that. '
'Do not explain your answer.',
},
}

View File

@ -1,4 +1,5 @@
import json
import os
import re
from os import environ
@ -140,7 +141,7 @@ def extract_answer(response_text: str):
class MATHDataset(BaseDataset):
@staticmethod
def load(path: str):
def load(path: str, file_name: str = 'math.json'):
path = get_data_path(path)
dataset = DatasetDict()
raw_data = []
@ -155,7 +156,8 @@ class MATHDataset(BaseDataset):
extract_boxed_answer(item['solution'])
})
else:
data = json.load(open(path))
file_path = os.path.join(path, file_name)
data = json.load(open(file_path))
for i in data.keys():
raw_data.append({
'problem':

View File

@ -0,0 +1 @@
from .musr import * # noqa: F401, F403

View File

@ -0,0 +1,81 @@
# flake8: noqa: E501
story = """
In the smoke-filled haze of a thriving jazz club, Alice met her explosive end, leaving Detective Winston to sift through the suspects: Eugene, the shy pianist, and Gabrielle, the sassy club singer.
While seated at his desk at the precinct, Winston received a phone call from a certain observant local bartender, tipping off the police about a harsh row peaking in a nearby jazz club. He signaled to his partner as they promptly dispatched to the scene, already ringing with sirens and a restless crowd.
With the police line restraining the horde, the jazz club was undergoing a full round-up as Winston approached the informative bartender. The bartender was engrossed in his account to the officers about a raucous, punch throwing fight Eugene was part of, to his best recollection. Winston remembered Eugene, a jazz fanaticlurking around the jazz corners more often than anyone else could recount.
In the heart of the upheaval, lay a woman sprawled on the floor, later identified as Alice, a frequent face at the jazz scene and a financial analyst deeply engrossed in financial transactions. In public, Alice had made her concerns known about her discovery of fraudulent transactions at the bank, promising to report the same to the authorities. Eugene, remembered conspicuously for being a bank teller at the same bank Alice worked at, suddenly seemed closely linked.
Eugenes arrest was far from hushed, with the local news broadcasting the progressing drama live, catching sight of Eugene curtailed in handcuffs. Concurrently, it was ascertainedEugene was a member of the jazz club. This evidence backed by a jazz club membership card retrieved from his wallet during the arrest.
Just a few steps away, he noticed a man in a suit, the bouncer, a calm figure amid the bedlam. In their conversation, the bouncer corroborated that he had indeed seen Eugene involved in a heated scuffle, landing a few punches. The whisperings were starting to gain momentum, since Eugene was believed to be on the losing end of a lawsuita battle courtesy of Alice charging Eugene with the financial fraud she had publicly vowed to expose.
Eugene was known for his frequent presence at the jazz club and on top of that, was an actual member. Therefore, it was hardly a leap to presume Alice meeting her untimely end at the club was no mere happenstance. The jazz club, despite its dim lights and pulsating music, was a public place easily accessible to all, including potential suspects like Eugene and, sadly, the ill-starred Alice.
Det. Winston knew he was now tasked with a cryptic puzzle. A bank teller, embroiled in suspected fraud and a lawsuit, a jazz club murder scene and a local financial analystall woven into a ghastly murder mystery. He sighed in distaste as Eugene was escorted awaya man still oblivious to the chain of events waiting for him. But Winston knew, the night had only just begun for him.
Winston stared down at the crumpled microphone on the floor. He picked it up gingerly, turning it in his hand. The club was in disarray, debris scattered like confetti. The lab boys were still picking pieces of the grenade apart.
"Gabrielle's microphone," the coroner confirmed, barely looking up from his task.
"Give him the once-over for evidence," Winston said, handing the microphone to a nearby officer.
Leaving the club behind him, Winston sighed heavily. The world of jazz had taken a dark turn that night. Alice, the acclaimed critic with her sarcastic wit and keen critical eye, had been last seen alive here. Her purse lay in the club untouched, a testament to the abruptness of the event.
Gabrielle had been working as a war correspondent. Winston had read her articles. They were richly detailed, passionate, and highlighted the harsh reality of war zones. Gabrielle hadn't been shy about sharing her experiences or publicly criticizing the military in her pieces. She boldly interviewed military personnel and spent extended periods in conflict zones.
Alice, though, never missed a chance to pick apart Gabrielle's articles. The vitriolic snippets in Alices column were regular features and Gabrielle's staunch defense of her articles, her work in the jazz scene, did little against Alice's respected reputation.
The tension between them was palpable. Alice had been awarded a major journalist award that Gabrielle had desired. This only deepened their rivalry, with Gabrielle feeling overlooked for this recognition in the Jazz scene.
Winston cast his gaze over the club once morea hub of pulsating rhythms now eerily silent.
A significant part of the evening was Gabrielle's recorded interview with Alice. It played on the local radio, their professional rivalry subtly echoing under their professional demeanor.
With a deep breath, Winston knew he had a tall task ahead. The jazz club, where Alice was last seen alive was now shrouded in an eerie silence, the vibrant rhythms of what used to be a lively night echoing in the abandoned stage. It was up to him to piece together the missing notes and bring the symphony of this unsolved case to a satisfying finale.
Who is the most likely murderer?
Pick one of the following choices:
1 - Eugene
2 - Gabrielle
You must pick one option. Before selecting a choice, explain your reasoning step by step. The murderer needs to have a means (access to weapon), motive (reason to kill the victim), and opportunity (access to crime scene) in order to have killed the victim. Innocent suspects may have two of these proven, but not all three. An innocent suspect may be suspicious for some other reason, but they will not have all of motive, means, and opportunity established.
If you believe that both suspects have motive, means, and opportunity, you should make an educated guess and pick the one for whom these are best established. If you believe that neither suspect has all three established, then choose the suspect where these are most clearly established. Explain your reasoning step by step before you answer. Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)"
""".strip()
reasoning = """
Let's break this down step-by-step by first deducing which of the two suspects has a means, motive, and opportunity.
We will start with Eugene.
Eugene was being sued by Alice for fraudulent transactions. The charge was also very public. Both of these facts point to Eugene having a strong motive.
Because Eugene has a jazz club membership, and we can deduce that the jazz club membership belongs to the same club Alice was murdered in, we can assume Eugene has an opportunity to commit the crime.
Although we know Eugene is aggressive because he was throwing punches in the story, we do not know if he has access to the murder weapon. Because he does not have access to a grenade, he does not have a means.
Let's review Gabrielle next.
Gabrielle's purse was found at the scene of the crime, and we can then assume she had the opportunity to kill Alice.
Because Gabrielle has been in conflict zones with military personnel, it's possible that she has access to a grenade. We can say that Gabrielle has a potential means to kill the victim.
Finally, it appears that Gabrielle and Alice had a rivalry over journalism, which could have boiled up into physical action. Because of this, we can say that Gabrielle has a potential motive to kill the victim.
Now, reviewing the evidence, we see that:
Eugene has a motive and opportunity but no means.
Gabrielle has a motive, means, and opportunity.
Therefore, Gabrielle is the most likely murderer.
ANSWER: 2
""".strip()
murder_mystery_solved_ex = f'{story}\n\n{reasoning}'

View File

@ -0,0 +1,309 @@
# flake8: noqa: E501
import json
import os.path as osp
from datasets import Dataset
from opencompass.datasets.base import BaseDataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils import get_data_path
from .murder_mystery_solved_ex import murder_mystery_solved_ex
from .object_placements_solved_ex import object_placements_solved_ex
from .team_allocation_solved_ex import team_allocation_solved_ex
from .tree import LogicTree
DATASET_CONFIGS = {
'murder_mysteries': {
'file_name':
'murder_mysteries.json',
'ex':
murder_mystery_solved_ex, # write user example here
'system_prompt':
'You are a helpful assistant that will answer the questions given by the user.',
'hint':
('Before selecting a choice, explain your reasoning step by step. '
'The murderer needs to have a means (access to weapon), motive (reason to kill the victim), '
'and opportunity (access to crime scene) in order to have killed the victim. '
'Innocent suspects may have two of these proven, but not all three. '
'An innocent suspect may be suspicious for some other reason, but they will not have all of motive, '
'means, and opportunity established.\n\n'
'If you believe that both suspects have motive, means, and opportunity, you should make an educated guess '
'and pick the one for whom these are best established. If you believe that neither suspect has all '
'three established, then choose the suspect where these are most clearly established.'
),
'hint_before_question':
False,
'answer_index_modifier':
1
},
'object_placements': {
'file_name':
'object_placements.json',
'ex':
object_placements_solved_ex,
'skip_ablated':
True,
'ablation_depth_modifier':
2,
'system_prompt':
'You are a helpful assistant that will answer the questions given by the user.',
'hint':
('Based on this story, we want to identify where someone believes that a certain object is at the end of '
'the story. In order to do that, you need to read the story and keep track of where they think the object '
'is at each point. When an object is moved, the person may observe its new location if they saw it move.\n\n'
'To see where an object ends up, they must be able to see the location that it moves to and not be too '
'distracted by what they are doing. If they do not observe the object moving, then they will still believe '
'it to be in the last location where they observed it.'),
'hint_before_question':
True,
'answer_index_modifier':
1
},
'team_allocation': {
'file_name':
'team_allocation.json',
'ex':
team_allocation_solved_ex,
'system_prompt':
'You are a helpful assistant that will answer the questions given by the user.',
'hint':
('The story should allow you to determine how good each person is at a skill. Roughly, each person is '
'either great, acceptable, or bad at a task. We want to find an optimal assignment of people to tasks '
'that uses their skills as well as possible. In addition, one task will have to have two people assigned '
'to it. The effectiveness of their teamwork (great team, acceptable team, or bad team) also impacts the '
'overall quality of the assignment.\n\n'
'When two people need to work on a task and one is bad at it, they don\'t necessarily benefit from the '
'other person being good, unless they work well together.\n\n'
'With different strengths, weaknesses, and interpersonal dynamics at play, you should allocate your team '
'to find the single assignment to ensure that the tasks overall are completed as effectively as possible.'
),
'hint_before_question':
False,
'answer_index_modifier':
1
}
}
@LOAD_DATASET.register_module()
class MusrDataset(BaseDataset):
"""MuSR.
Args:
path (str): path to dataset
name (str): name of dataset
self_consistency_n (int): Number of prompt copies generated per question for self-consistency
exclude_contrastive_examples (bool): Whether to exclude contrastive examples
reverse_contrastive_sample (bool): Whether to reverse the selection of contrastive samples
skip_ablated (bool): Whether to skip ablated samples
offset (int): Starting offset for the dataset
sample_size (int): Sample size, None indicates using the entire dataset.
"""
@staticmethod
def load(path,
name,
self_consistency_n=1,
exclude_contrastive_examples=False,
reverse_contrastive_sample=False,
skip_ablated=False,
randomize=False,
offset=0,
sample_size=None,
**kwargs):
"""Load the dataset and flatten fields while constructing prompts,
taking self_consistency_n and ablations into account."""
if name not in DATASET_CONFIGS:
raise ValueError(
f'Dataset name {name} not supported. Must be one of {list(DATASET_CONFIGS.keys())}'
)
config = DATASET_CONFIGS[name]
path = get_data_path(path)
file_path = osp.join(path, config['file_name'])
with open(file_path, 'r', encoding='utf-8') as f:
dataset = json.load(f)
filtered_dataset = []
hashes_done = []
for example in dataset:
if exclude_contrastive_examples and example['questions'][0].get('intermediate_data') and \
len(example['questions'][0].get('intermediate_data')) > 0 and \
example['questions'][0]['intermediate_data'][0].get('story_hash_id'):
story_hash = example['questions'][0]['intermediate_data'][0][
'story_hash_id']
if story_hash in hashes_done:
if reverse_contrastive_sample:
filtered_dataset.append(example)
else:
continue
elif not reverse_contrastive_sample:
filtered_dataset.append(example)
hashes_done.append(story_hash)
else:
filtered_dataset.append(example)
filtered_dataset = filtered_dataset[
offset:offset +
min(len(filtered_dataset), sample_size) if sample_size else None]
ablations = [
# {'prompt': 'regular', 'name': 'regular'},
# {'prompt': 'cot', 'name': 'cot'},
{
'prompt': 'cot+',
'name': 'cot+'
},
]
# create prompts
flattened_data = []
for example in filtered_dataset:
context = example['context']
questions = example['questions']
for question in questions:
choices_list = question['choices']
choices_str = '\n'.join([
f'{idx + 1} - {choice}'
for idx, choice in enumerate(choices_list)
])
gold_answer = question['answer'] + config.get(
'answer_index_modifier', 1)
for ablation in ablations:
prompt_style = ablation.get('prompt', 'cot+')
ablation_name = ablation.get('name', 'cot+')
for scidx in range(self_consistency_n):
ex_str = ''
if ablation.get('use_example') and config.get('ex'):
ex_str = (
'Here is an example of solving the task:\n\n' +
config.get('ex') +
'\n\nThis is the end of the example. The real task is below.\n\n---\n\n'
)
if prompt_style == 'regular':
prompt = f'{ex_str}{context}\n\n{question["question"]}\n\n' \
f'Pick one of the following choices:\n{choices_str}\n\n' \
'You must pick one option. Finally, the last thing you generate should be "ANSWER: (your answer here, include the choice number)"'
elif prompt_style == 'cot':
prompt = f'{ex_str}{context}\n\n{question["question"]}\n\n' \
f'Pick one of the following choices:\n{choices_str}\n\n' \
'You must pick one option. Explain your reasoning step by step before you answer. ' \
'Finally, the last thing you generate should be "ANSWER: (your answer here, include the choice number)"'
elif prompt_style == 'cot+':
if config.get('hint_before_question'):
prompt = f'{ex_str}{context}\n\n{config["hint"]}\n\n{question["question"]}\n\n' \
f'Pick one of the following choices:\n{choices_str}\n\n' \
'You must pick one option. Explain your reasoning step by step before you answer. ' \
'Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)"'
else:
prompt = f'{ex_str}{context}\n\n{question["question"]}\n\n' \
f'Pick one of the following choices:\n{choices_str}\n\n' \
f'You must pick one option. {config["hint"]} Explain your reasoning step by step before you answer. ' \
'Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)"'
else:
if len(question['intermediate_trees']
) == 0 or config.get('skip_ablated', False):
continue
prompt = f'{ex_str}Answer the following questions given the list of facts per answer choice.\n\n'
for c, t in zip(choices_str.split('\n'),
question['intermediate_trees']):
# extract facts from intermediate_trees
facts = list(
set([
x.value for x in
LogicTree.from_json(t).get_facts(
include_cs=ablation.get(
'include_cs', False),
include_deductions_from_level=-1,
no_facts_after_depth=ablation.get(
'no_facts_after_depth', 3) +
config.get(
'ablation_depth_modifier', 0))
]))
if config.get('allow_sorted_facts', True):
facts = sorted(facts)
facts_str = '\n'.join(
[f'- {fact}' for fact in facts])
prompt += f'Facts for Choice {c}:\n{facts_str}\n\n'
prompt += f'Given the list of facts per answer choice, answer the following question\n\n' \
f'{question["question"]}\n\n' \
f'Pick one of the following choices:\n{choices_str}\n\n' \
'You must pick one option. After you have found the answer, say it in this format "ANSWER: (your answer here, include the choice number)"'
flattened_example = {
'context':
context,
'question_text':
question['question'],
'question':
question,
'answer':
question['answer'],
'choices':
choices_list,
'choices_str':
choices_str,
'intermediate_trees':
question.get('intermediate_trees', []),
'intermediate_data':
question.get('intermediate_data', []),
'prompt':
prompt,
'system_prompt':
config.get('system_prompt', ''),
'gold_answer':
gold_answer,
'scidx':
scidx, # self-consistency index
'self_consistency_n':
self_consistency_n,
'ablation_name':
ablation_name,
}
flattened_data.append(flattened_example)
dataset = Dataset.from_list(flattened_data)
return dataset
@ICL_EVALUATORS.register_module()
class MusrEvaluator(BaseEvaluator):
def __init__(self, answer_index_modifier=1, self_consistency_n=1):
self.answer_index_modifier = answer_index_modifier
self.self_consistency_n = self_consistency_n
def score(self, predictions, references):
correct = 0
assert len(predictions) == len(
references
), 'Predictions and references must have the same length!'
total = len(predictions)
for pred, ref in zip(predictions, references):
if 'ANSWER:' in pred:
answer_line = [
line for line in pred.split('\n') if 'ANSWER:' in line
]
if answer_line:
answer = answer_line[0].split('ANSWER:')[-1].strip()
import re
match = re.search(r'\d+', answer)
if match:
pred_answer = int(match.group())
if pred_answer == ref:
correct += 1
accuracy = 100 * correct / total if total > 0 else 0
return {'accuracy': accuracy}
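# Minimal usage sketch (assumptions: references already carry the 1-based gold
# choice index, i.e. answer + answer_index_modifier, as built in the flattened
# dataset above; predictions are raw model outputs ending in an "ANSWER:" line).
if __name__ == '__main__':
    _evaluator = MusrEvaluator(answer_index_modifier=1)
    _predictions = [
        'Step-by-step reasoning...\nANSWER: 3',  # matches its reference
        'ANSWER: (2)',  # does not match its reference
    ]
    _references = [3, 1]
    print(_evaluator.score(_predictions, _references))  # {'accuracy': 50.0}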

View File

@ -0,0 +1,53 @@
# flake8: noqa: E501
story = '''
Petra, the dedicated housewife, felt a thrill at the thought of her surprise anniversary dinner for her husband, Daniel. She had been skillfully maneuvering around Daniel's eagerness to pitch in without disappointing him or giving up her surprise.
Daniel, ever-the-observant-husband, noted Petra's unusual enthusiasm about the day's menu. Despite not knowing the details, he appreciated her effort and wanted to help; silently, he decided to deploy his best skill: patiently awaiting his moment to help, maybe when Petra asked for something from the pantry. Amidst the excitement, there was Clara, their maid, ever diligent and efficient, trying to keep the surroundings perfect for this special evening.
Tucked away, under the counter, was Petra's secret recipe book, her culinary treasure. Her solace in confusing times, her secret weapon during their flavorful adventures. While toward the back of the pantry, was the glass jar of Petra's favorite spice blends, something that Daniel was well aware of, in case an opportunity arose for him to assist or distract when Petra might need it.
All three residents of the home were aware of each item's location. The secret recipe book under the counter, the glass jar in the pantry, and the anxious excitement that filled the air—a fragrance even more intoxicating than the delicious smells that would soon fill the kitchen.
With tact and secrecy, Petra relocated her cherished recipe book from its hidden spot under the counter to its temporary home on the kitchen table. The pages were swiftly opened to reveal her secret recipes which she was eager to start preparing for the long-awaited anniversary surprise. While Petra was engrossed in her preparations, Clara continued her sweeping routine in the kitchen. Clara's steady broom strokes on the wooden floor echoed a little in the otherwise busy and humming kitchen. In the background, beyond the kitchen door, Daniel could be seen in the dining room, meticulously setting the table for the anticipated special dinner.
The placement of the rooms allowed Clara to easily notice Petra's movements in her peripheral vision while she was executing her chores. Every move Petra made was observed in Clara's line of sight. Simultaneously, separated by the walls, Daniel was diligently arranging the tableware in the dining room which was separate from Petra's bustling kitchen.
Hoping to spruce up the setting, Daniel delicately relocated a glass jar filled with decorative pebbles to the center of the dining table. His subtle contribution for the evening - a perfectly presentable table for their special anniversary dinner. Amidst the flurry of the special day's preparations, Clara diligently carried on with her duties in the upstairs bathroom, unseen from the dining room. Meanwhile, Petra was wholly engrossed in the allure of a new recipe in her cherished, hidden book which lay opened on the kitchen island, away from prying eyes of the dining room.
In the middle of her usual tidying, Clara spotted Petra's treasured recipe book on the kitchen table. Ensuring it stayed clandestine, Clara carefully transferred it back to its usual hideaway spot beneath the counter. In the midst of the anniversary excitement, Clara deftly transferred Petra's secret weapon back to its hiding place when Daniel stepped out into the garage to retrieve extra utensils. Performing her duty with a sense of urgency, she made sure to move quietly to not disturb Petra, who was engrossed in the process of boiling a massive pot of pasta water on the stove.
Despite the commotion and fervor in the kitchen, the hubbub did not stretch as far as the garage, which remained undisturbed by the domestic activity occurring in the main part of the house. Meanwhile, in the kitchen, Petra was oblivious to Clara's subtle maneuver while she busied herself at the stove, focused on making sure the large pot of water reached the perfect boil.
In the end, the careful orchestration of duties by each individual within the house concluded in a harmonious anniversary celebration. The marks of a successful evening consisted of a delectable meal, a serene atmosphere, and the memory of a smooth, incident-free evening where everyone played their role to perfection.
Based on this story, we want to identify where someone believes that a certain object is at the end of the story. In order to do that, you need to read the story and keep track of where they think the object is at each point. When an object is moved, the person may observe its new location if they saw it move.
To see where an object ends up, they must be able to see the location that it moves to and not be too distracted by what they are doing. If they do not observe the object moving, then they will still believe it to be in the last location where they observed it.
Which location is the most likely place Clara would look to find the glass jar given the story?
Pick one of the following choices:
1 - dining table
2 - kitchen table
3 - pantry
4 - under counter
You must pick one option. Explain your reasoning step by step before you answer. Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)"
'''.strip()
reasoning = '''
Let's solve this by thinking step-by-step. We want to know where Clara will check to find the glass jar, so let's track where Clara sees the glass jar throughout the story.
At the beginning of the story, it is stated that "All three residents of the home were aware of each item's location... the glass jar in the pantry." From this, we can conclude that the first place in the story where Clara sees the glass jar is in the pantry.
Throughout the story, the glass jar only moves once to the dining table. However, while Daniel was moving the glass jar, Clara was upstairs in the restroom carrying out her duties. It's highly unlikely that she saw Daniel move the glass jar, so we can assume that she still believes it to be in the pantry.
Clara does go to the kitchen in the story and moves a recipe book from the kitchen table, but because it's the kitchen table and not the dining room table, we can assume she hasn't seen the glass jar there.
Now, given the story and evidence, we can assume that Clara believes the glass jar to be in the pantry.
ANSWER: 3
'''.strip()
object_placements_solved_ex = f'{story}\n\n{reasoning}'

View File

@ -0,0 +1,72 @@
# flake8: noqa: E501
story = '''
In the quaint community of Midvale, the local school stood as a beacon of enlightenment, nurturing the minds of the next generation. The teachers, the lifeblood of this institution, were tasked with the noble duty of education, while the unsung heroes, the maintenance crew, ensured the smooth functioning of the school's infrastructure. Amidst this, three town residents, Angela, Greg, and Travis, found themselves at a juncture of life where they were presented with the opportunity to serve in one of these crucial roles. The challenge now lay in the hands of the manager, who had to assign them to either teaching or maintenance, a decision that would set the course for their contributions to the school.
Angela was a fiercely independent woman, beset with a unique set of strengths and weaknesses. She was a woman of very few words, often finding it hard to articulate her thoughts and explain things clearly. Venturing into education seemed a maze with her apathetic attitude towards learning. She was also seen to be disinterested in reading and the literary field as a whole. This was a juxtaposition to her inability to contribute to maintenance duties because of her fear of tools and machinery, a sinister remnant of a past accident that still haunted her. The basic handyman skills, which most locals learned growing up, were also absent from her repertoire.
Angela's interactions with Greg and Travis further complicated the equation. On one hand, Greg and Angela had a habit of arguing constantly over trivial matters, which once culminated in their failure to complete a shared basic training exercise adequately. On the other hand, Angela and Travis simply had nothing in common. Their conversations were often fraught with awkward silences, indicative of their lack of shared interests. This lack of coordination was epitomized during a recent team-building exercise when their team finished last.
Greg was the blue-collar type with a broad frame and muscular build. He had a work ethic that never shied away from toiling through the day to get things done. Growing up, he often helped his father with simple home repairs and minor renovations, learning the ropes of basic handiwork. Additionally, Greg had fortified his skills while refurbishing an old shed with Travis, a testament to their compatible personalities. However, his dislike for education was well known throughout town, further amplified by his lack of patience, especially with children.
Travis, the third cog in the wheel, was a man of many peculiarities. His stage fright was almost legendary and made it nearly impossible for him to stand in front of a crowd. Often, the mere thought of it could unnerve him. His physical constitution was lightweight and fragile, and long hours of manual labor made him weary. He also had a revulsion towards dirt that he complained about at every opportune moment. Like the others, studying did not appeal to him much, so much so that he had stopped reading completely after leaving school prematurely.
The manager understood well that a team's success depends heavily on the contribution and compatibility of each member. He observed, analyzed, and considered. Now, it was up to him to assign roles to Angela, Greg, and Travis. The school needed educators and maintenance staff, and each had to play their part perfectly.
Given the story, how would you uniquely allocate each person to make sure both tasks are accomplished efficiently?
Pick one of the following choices:
1 - Teaching: Travis, Maintenance: Angela and Greg
2 - Teaching: Greg, Maintenance: Angela and Travis
3 - Teaching: Angela, Maintenance: Greg and Travis
You must pick one option. The story should allow you to determine how good each person is at a skill. Roughly, each person is either great, acceptable, or bad at a task. We want to find an optimal assignment of people to tasks that uses their skills as well as possible. In addition, one task will have to have two people assigned to it. The effectiveness of their teamwork (great team, acceptable team, or bad team) also impacts the overall quality of the assignment.
When two people need to work on a task and one is bad at it, they don't necessarily benefit from the other person being good, unless they work well together.
With different strengths, weaknesses, and interpersonal dynamics at play, you should allocate your team to find the single assignment to ensure that the tasks overall are completed as effectively as possible.
Explain your reasoning step by step before you answer. Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)"
'''.strip()
reasoning = '''
Let's solve this by thinking step-by-step. First, we will figure out each person's skill level for each task. Then, we can measure how well they all work together in pairs. From this, we can find the most efficient assignment that maximizes the scores.
Let's start with Angela. Angela can't articulate her thoughts, and she seems unprepared for teaching. So, let's assume her skill level is 1 for teaching. She also is bad at maintenance due to her fear of tools and machinery. So, let's assume her skill level is 1 for maintenance as well.
Now, let's look at Greg. Greg has a dislike for education and a lack of patience, so let's assume his skill level for teaching is 1. However, Greg has helped with home repairs and minor renovations, so let's assume his maintenance skill level is 2.
Finally, let's look at Travis. Travis has extreme stage fright, which will make it difficult to teach, so let's assume his teaching skill level is 1. He also has a lightweight and fragile frame as well as hates dirt, so let's assume his maintenance skill level is 1.
Now, let's look at the relationships and how people work together.
Angela and Greg do not get along; they are constantly arguing, so let's assume their ability to work together is 1.
Angela and Travis aren't much better. They both have nothing in common, and they couldn't do a team-building exercise previously, so let's assume their ability to work together is 1.
Finally, Greg and Travis have worked together, and their personalities seem to meld, so let's assume they work well together with a score of 3.
Let's summarize and figure out the best assignment.
Angela is bad at teaching. (1)
Angela is bad at maintenance. (1)
Angela does not work well with Greg. (1)
Angela does not work well with Travis. (1)
Greg is bad at teaching. (1)
Greg is okay with maintenance. (2)
Greg and Travis work well together. (3)
Travis is bad at teaching. (1)
Travis is bad at maintenance. (1)
Now, let's find the best assignment.
Option 1: Travis as a teacher (1) + Angela working in maintenance (1) + Greg working in maintenance (2) + Angela and Greg work badly together (1) = 5
Option 2: Greg as a teacher (1) + Angela working in maintenance (1) + Travis working in maintenance (1) + Angela and Travis work badly together (1) = 4
Option 3: Angela as a teacher (1) + Greg working in maintenance (2) + Travis working in maintenance (1) + Greg and Travis work well together (3) = 7
So, from this, we can see Option 3 has the maximum score.
ANSWER: 3
'''.strip()
team_allocation_solved_ex = f'{story}\n\n{reasoning}'
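# Minimal sketch of the scoring scheme the reasoning above walks through: each
# option sums the teacher's skill, both maintainers' skills, and the
# maintainers' teamwork (values taken from the reasoning, 1 = bad, 3 = great).
if __name__ == '__main__':
    skills = {
        'Angela': {'teach': 1, 'maintain': 1},
        'Greg': {'teach': 1, 'maintain': 2},
        'Travis': {'teach': 1, 'maintain': 1},
    }
    teamwork = {
        frozenset(['Angela', 'Greg']): 1,
        frozenset(['Angela', 'Travis']): 1,
        frozenset(['Greg', 'Travis']): 3,
    }
    options = {
        1: ('Travis', ['Angela', 'Greg']),
        2: ('Greg', ['Angela', 'Travis']),
        3: ('Angela', ['Greg', 'Travis']),
    }
    for idx, (teacher, maintainers) in options.items():
        score = (skills[teacher]['teach'] +
                 sum(skills[m]['maintain'] for m in maintainers) +
                 teamwork[frozenset(maintainers)])
        print(f'Option {idx}: {score}')  # 5, 4, 7 -> Option 3 is best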

View File

@ -0,0 +1,739 @@
# flake8: noqa: E501
"""WARNING (or more like an aggressive note).
A lot of functionality was implemented here for earlier experiments, most of which is not used. We have left it here
for backwards compatibility with the current dataset, as well as because why not.
ALSO NOTE:
This file was created to have no dependencies on anything in the repo for a reason. You can copy this file into your
own project and use the classes to parse/visualize/edit the logic trees in the dataset or create your own.
FINAL NOTE:
See examples of how to create LogicNodes and LogicTrees in the __main__ part of the file.
"""
import random
from copy import deepcopy
from enum import Enum
from typing import Any, Dict, List
import numpy as np
class LogicNodeOperatorType:
"""How should the deduction combine the nodes (choose will randomly sample
and/or when populate is called)"""
AND = 'and'
OR = 'or'
CHOOSE = 'choose'
class LogicNodeFactType:
"""Is a node explicit (mentioned in the story) or commonsense knowledge
(left unsaid)"""
EXPLICIT = 'explicit'
COMMONSENSE = 'commonsense'
class LogicNodeConstraints:
"""Useful for things like children = ['X is the murderer', 'Y is the murderer', 'Z is the murderer'], we no longer use this structure though."""
ONLY_ONE_CAN_BE_TRUE = 'Only one child can be true'
class LogicNodeDeductionType:
"""What type of deduction should be used here (not used currently)"""
SYLLOGISM = 'syllogism'
TEMPORAL = 'temporal'
SPATIAL = 'spatial'
CHOOSE = 'choose'
class LogicNode:
"""A LogicNode is a tree primitive.
It is either a deduction or a leaf fact. Leaf facts are the ones that we
use in story generation (if they are explicit facts and not commonsense).
"""
value: str
children: List['LogicNode']
fact_type: str
operator: str
constraints: List[str]
deduction_type: str
prunable: bool
can_be_leaf: bool
def __init__(
self,
value: str = '',
children: List['LogicNode'] = None,
operator: str = LogicNodeOperatorType.OR,
fact_type: str = LogicNodeFactType.EXPLICIT,
constraints: List[str] = (),
deduction_type: str = None,
prunable: bool = True,
can_be_leaf: bool = False,
frozen: bool = False,
):
"""
:param value: Content for this specific node (also the deduction of the children).
:param children: The children for this node.
:param operator: Should the children be "And"ed or "Or"ed to create the deduction (the content of this node).
:param fact_type: Explicit or commonsense
:param constraints: Not used anymore (see LogicNodeConstraints)
:param deduction_type: Not used anymore (see LogicNodeDeductionType)
:param prunable: Can this node be removed from the tree (we don't prune in our datasets)
:param can_be_leaf: Can this node be a leaf node (usually false for nodes that you are injecting manually)
:param frozen: Should we add/prune children in the populate function (if frozen, no children will be added or removed, but the children may have children appended/pruned from them).
"""
self.value = value
if children is None:
children = []
self.children = children
self.operator = operator
self.fact_type = fact_type
self.constraints = constraints
self.deduction_type = deduction_type
self.prunable = prunable
self.can_be_leaf = can_be_leaf
self.frozen = frozen
self.parent = None
@property
def children(self):
return self._children
@children.setter
def children(self, children: List['LogicNode']):
self._children = children
for c in self.children:
c.parent = self
def __str__(self):
line = []
cnsts = ', '.join([str(x.value) for x in self.constraints])
if self.value and self.value != '':
line.append(self.value)
if len(self.children) > 0:
line.append(self.operator)
else:
line.append(self.fact_type)
if self.deduction_type:
line.append(self.deduction_type)
if len(self.constraints) > 0:
line.append(cnsts)
if len(self.children) > 0:
line.append(f'children: {len(self.children)}')
return ' | '.join(line)
def __repr__(self):
return str(self)
def to_json(self):
return {
'value': self.value,
'children': [x.to_json() for x in self.children],
'fact_type': self.fact_type,
'operator': self.operator,
'constraints': self.constraints,
'deduction_type': self.deduction_type,
'prunable': self.prunable,
'can_be_leaf': self.can_be_leaf
}
@classmethod
def from_json(cls, js):
js['children'] = [LogicNode.from_json(x) for x in js['children']]
return cls(**js)
class LogicTree:
"""Main datastructure used when creating a MuSR example.
It's basically a standard tree with some parameters controlling the shape.
"""
nodes: List[LogicNode]
chance_of_or: float
chance_of_cs_fact: float
depth: int
chance_to_prune: float
chance_to_prune_all: float
bf_factor: Dict[int, float]
deduction_type_sample_rate: Dict[LogicNodeDeductionType, float]
root_structure: List[List[LogicNode]] = ()
def __init__(self,
chance_of_or: float = 0.3,
chance_of_cs_fact: float = 0.1,
depth: int = 2,
chance_to_prune: float = 0.6,
chance_to_prune_all: float = 0.2,
bf_factor: Dict[int, float] = None,
deduction_type_sample_rate: Dict[LogicNodeDeductionType,
float] = None,
enforce_cs_fact_per_level: bool = False,
root_structure: List[Any] = (),
nodes: List[LogicNode] = (),
populate: bool = True,
prune: bool = True):
"""
:param chance_of_or: (not used) how often should a node with children be an OR
:param chance_of_cs_fact: (not used) how often should there be a commonsense node
:param depth: How deep should a tree go
:param chance_to_prune: Percentage chance of pruning a node
:param chance_to_prune_all: Percentage chance of pruning all children from a node.
        :param bf_factor: Branching factor (a dictionary of percentages, e.g. {1: 0.33, 2: 0.33, 3: 0.33}).
:param deduction_type_sample_rate: (not used, see bf_factor and LogicNodeDeductionType)
:param enforce_cs_fact_per_level: Enforce 1 commonsense fact per level in the tree (we use this instead of chance_of_cs_fact)
:param root_structure: List of LogicNodes to build off of.
:param nodes: List of LogicNodes to define the LogicTree on (we will not populate/prune the tree if this is filled)
:param populate: Should we populate children for the tree according to the other parameters?
:param prune: Should we prune the children for the tree according to the other parameters?
"""
self.chance_of_or = chance_of_or
self.chance_of_cs_fact = chance_of_cs_fact
self.depth = depth
self.chance_to_prune = chance_to_prune
self.chance_to_prune_all = chance_to_prune_all
self.bf_factor = bf_factor
self.enforce_cs_fact_per_level = enforce_cs_fact_per_level
if not bf_factor:
self.bf_factor = {2: 0.8, 3: 0.2}
if not deduction_type_sample_rate:
deduction_type_sample_rate = {
LogicNodeDeductionType.SYLLOGISM: 1.0
}
self.deduction_type_sample_rate = deduction_type_sample_rate
self.root_structure = root_structure
if len(nodes) > 0:
self.nodes = nodes
else:
if root_structure is not None and len(root_structure) > 0:
self.nodes = root_structure
else:
self.nodes = [
LogicNode('root', operator=LogicNodeOperatorType.AND)
]
if populate:
[self.populate(x, 1) for x in self.nodes]
if prune:
[self.prune(x, 1) for x in self.nodes]
def __str__(self):
return self.print_tree()
def get_facts(self,
include_cs: bool = False,
include_deductions_from_level: int = -1,
no_facts_after_depth: int = -1):
"""Get a list of LogicNodes from the tree. By default, you will get the
explicit leaf nodes.
:param include_cs: Include the commonsense nodes from all levels.
:param include_deductions_from_level: Include any intermediate deduction nodes from the specified level and deeper.
        :param no_facts_after_depth: Essentially treat the deductions at the specified depth as leaf nodes.
"""
def recurse_facts(_node: LogicNode, depth: int = 0) -> List[str]:
node = deepcopy(_node)
if depth >= no_facts_after_depth and no_facts_after_depth > -1:
node.children = []
facts = []
if node.fact_type == LogicNodeFactType.EXPLICIT and len(
node.children) == 0:
facts.append(node)
if node.fact_type == LogicNodeFactType.COMMONSENSE and include_cs and len(
node.children) == 0:
facts.append(node)
if len(
node.children
) > 0 and include_deductions_from_level <= depth and include_deductions_from_level > -1:
facts.append(node)
for child in node.children:
facts.extend(recurse_facts(child, depth + 1))
return list(set(facts))
facts = []
for n in self.nodes:
facts.extend(recurse_facts(n))
return facts
def print_tree(self, node=None, level=0):
"""Deprecated (not used)"""
if node is None:
node = self.nodes[0]
line = '-' * level * 4 + str(node) + (' | ' + str(node.operator) if
len(node.children) > 0 else '')
for child in node.children:
line += '\n' + self.print_tree(child, level + 1)
return line
def print_for_gpt(self,
node=None,
level=0,
pad_char=' ',
pad_space=4,
print_forward=True,
print_conjection_types: bool = False,
print_reasoning_types: bool = False,
ignore_value_after_depth: int = -1,
print_only_nodes_with_value: bool = False):
"""Complex print function. We often use it as
print_for_gpt(pad_space=1, pad_char='> ')
However, more complex arguments can be used to control what is printed.
This returns a string that must be printed (don't be confused by the method name.)
:param node: Start at a specific node.
:param level: Controls how much tabbing is done when printing the current node.
:param pad_char: Char to use that specifies depth ('> ' at depth 3 will look like '> > > ' if you have pad_space equal to 1 for example)
:param pad_space: How many spaces to include between pad_chars
:param print_forward: Print the tree with parent nodes first.
:param print_conjection_types: Print the Ands and Ors per deduction (not used)
:param print_reasoning_types: Print the deduction types (not used)
:param ignore_value_after_depth: Ignore content of the nodes once a depth is met
:param print_only_nodes_with_value: Ignore nodes without content.
"""
line = ''
if node is None:
node = self.nodes[0]
if not print_forward:
for child in node.children:
v = self.print_for_gpt(
child,
level + 1,
pad_char=pad_char,
pad_space=pad_space,
print_forward=print_forward,
ignore_value_after_depth=ignore_value_after_depth,
print_only_nodes_with_value=print_only_nodes_with_value)
if v != '':
line += v + '\n'
ignore_val = ignore_value_after_depth > -1 and ignore_value_after_depth < level
ignore_line = print_only_nodes_with_value and node.value == ''
if ignore_line:
line_val = ''
else:
line_val = (node.value + ' | ' if node.value != '' and not ignore_val else '') + (
('Fact From Story' if node.fact_type == LogicNodeFactType.EXPLICIT else 'Commonsense Knowledge') \
if len(node.children) == 0 else 'Deduced Fact')
if level == 0:
line_val = (node.value + ' | ' if node.value != '' else
'') + 'Deduced Root Conclusion'
if len(node.children) > 0 and (print_conjection_types
or print_reasoning_types):
if print_conjection_types:
line_val += f' ({node.operator}'
else:
line_val += f'('
if node.deduction_type and print_reasoning_types:
line_val += f' | {node.deduction_type})'
else:
line_val += ')'
if len(node.constraints) > 0:
cnsts = ', '.join([str(x) for x in node.constraints])
line_val += f' constraints: [{cnsts}]'
line += pad_char * level * pad_space + line_val
if print_forward:
for child in node.children:
v = self.print_for_gpt(
child,
level + 1,
pad_char=pad_char,
pad_space=pad_space,
print_forward=print_forward,
ignore_value_after_depth=ignore_value_after_depth,
print_only_nodes_with_value=print_only_nodes_with_value)
if v != '':
line += '\n' + v
return line
def populate(self, node: LogicNode, current_depth: int = 1):
if node.operator == LogicNodeOperatorType.CHOOSE:
node.operator = LogicNodeOperatorType.OR \
if random.random() < self.chance_of_or else LogicNodeOperatorType.AND
if node.deduction_type == LogicNodeDeductionType.CHOOSE:
if node.operator != LogicNodeOperatorType.AND:
node.deduction_type = None
else:
node.deduction_type = random.choices(
list(self.deduction_type_sample_rate.keys()),
list(self.deduction_type_sample_rate.values()),
k=1)[0]
if not node.frozen:
bf = max(
0,
random.choices(list(self.bf_factor.keys()),
list(self.bf_factor.values()),
k=1)[0] - len(node.children))
if bf > 0:
new_nodes = []
one_fact_is_cs = False
for idx in range(bf):
roll_for_or = random.random()
fact_type = LogicNodeFactType.COMMONSENSE \
if random.random() < self.chance_of_cs_fact and not one_fact_is_cs else \
LogicNodeFactType.EXPLICIT
if roll_for_or > self.chance_of_or and\
current_depth < self.depth and\
not fact_type == LogicNodeFactType.COMMONSENSE:
new_nodes.append(
LogicNode(
f'',
operator=LogicNodeOperatorType.AND,
fact_type=fact_type,
deduction_type=random.choices(
list(self.deduction_type_sample_rate.keys(
)),
list(self.deduction_type_sample_rate.
values()),
k=1)[0],
prunable=True,
can_be_leaf=True,
))
else:
new_nodes.append(
LogicNode(f'',
operator=LogicNodeOperatorType.OR,
fact_type=fact_type,
prunable=True,
can_be_leaf=True))
if fact_type == LogicNodeFactType.COMMONSENSE:
node.operator = LogicNodeOperatorType.AND
if not node.deduction_type:
node.deduction_type = random.choices(
list(self.deduction_type_sample_rate.keys()),
list(self.deduction_type_sample_rate.values()),
k=1)[0]
one_fact_is_cs = True
if not one_fact_is_cs and self.enforce_cs_fact_per_level:
new_nodes.append(
LogicNode(f'',
operator=LogicNodeOperatorType.OR,
fact_type=LogicNodeFactType.COMMONSENSE,
prunable=False,
can_be_leaf=True))
node.children.extend(new_nodes)
if current_depth < self.depth:
for node in node.children:
if node.fact_type == LogicNodeFactType.COMMONSENSE:
continue
self.populate(node, current_depth + 1)
def prune(self, node: LogicNode, current_depth: int = 1):
to_prune = []
if current_depth > 1 and node.can_be_leaf:
if random.random() < self.chance_to_prune_all:
node.children = []
return
prunable = [x for x in node.children if x.prunable]
if (len(prunable) > 1 and node.operator == LogicNodeOperatorType.OR or\
len(prunable) > 2 and node.operator == LogicNodeOperatorType.AND) and\
current_depth <= self.depth:
if node.prunable:
for n in random.sample(
prunable,
len(prunable) -
(1 if node.operator == LogicNodeOperatorType.OR else 2)):
roll_to_prune = random.random()
if roll_to_prune < self.chance_to_prune:
to_prune.append(n)
node.children = [x for x in node.children if x not in to_prune]
for n in node.children:
self.prune(n, current_depth + 1)
def to_json(self):
args = {
'chance_of_or': self.chance_of_or,
'depth': self.depth,
'chance_to_prune': self.chance_to_prune,
'chance_to_prune_all': self.chance_to_prune_all,
'bf_factor': self.bf_factor,
'deduction_type_sample_rate': self.deduction_type_sample_rate,
'root_structure': [x.to_json() for x in self.root_structure],
'nodes': [x.to_json() for x in self.nodes]
}
return args
@classmethod
def from_json(cls, _js):
js = deepcopy(_js)
js['nodes'] = [LogicNode.from_json(x) for x in js['nodes']]
js['root_structure'] = [
LogicNode.from_json(x) for x in js['root_structure']
]
return cls(**js)
if __name__ == '__main__':
"""EXAMPLE USES."""
def tv_scene_ex():
root_structure = [
LogicNode('A good drama tv scene',
operator=LogicNodeOperatorType.OR,
prunable=False,
can_be_leaf=False,
frozen=True)
]
root_structure[0].children = [
LogicNode('Bob is sad.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False),
LogicNode('John now hates Bob.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False),
LogicNode('Bob bought a car.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False),
LogicNode('Bob wanted to be happy.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False),
]
tree = LogicTree(depth=4,
root_structure=root_structure,
bf_factor={
1: 0.5,
2: 0.5
},
chance_of_or=0.0,
chance_of_cs_fact=0.0,
chance_to_prune_all=0.5,
chance_to_prune=0.5,
enforce_cs_fact_per_level=True)
rep = tree.print_for_gpt(pad_space=1, pad_char='- ')
print(rep)
def eb_ex():
root_structure = [
LogicNode('',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=False)
]
n = LogicNode('Eruptions block sunlight.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=False,
frozen=True)
n.children = [
LogicNode('Eruptions produce ash clouds.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=True,
frozen=True),
LogicNode('Ash blocks sunlight.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=True,
frozen=True),
]
g = LogicNode('Eruptions can cause plants to die.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False,
frozen=True)
g.children = [
n,
LogicNode('Producers will die without sunlight.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=True,
frozen=True)
]
l = LogicNode('',
operator=LogicNodeOperatorType.AND,
prunable=False,
can_be_leaf=False)
l.children = [g]
root_structure[0].children = [l]
tree = LogicTree(depth=5,
root_structure=root_structure,
bf_factor={
1: 0.3,
2: 0.7
},
chance_of_or=0.0,
chance_of_cs_fact=0.0,
chance_to_prune_all=0.0,
chance_to_prune=0.0,
enforce_cs_fact_per_level=True)
rep = tree.print_for_gpt(pad_space=1, pad_char='- ')
print(rep)
def murder_mystery_ex():
root_structure = [
LogicNode('Killer',
operator=LogicNodeOperatorType.OR,
constraints=[LogicNodeConstraints.ONLY_ONE_CAN_BE_TRUE],
prunable=False,
can_be_leaf=False,
frozen=True)
]
suspect_nodes = [
LogicNode(f'Murderer Suspect {idx + 1}',
operator=LogicNodeOperatorType.AND,
prunable=False,
can_be_leaf=False,
frozen=True) for idx in range(1)
]
for s in suspect_nodes:
s.children = [
LogicNode('Suspect has means',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False),
LogicNode('Suspect has motive',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False),
LogicNode('Suspect has opportunity',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False)
]
root_structure[0].children = suspect_nodes
tree = LogicTree(depth=4,
root_structure=root_structure,
bf_factor={
1: 0.5,
2: 0.5
},
chance_of_or=0.0,
chance_of_cs_fact=0.0,
chance_to_prune_all=0.5,
chance_to_prune=0.5,
enforce_cs_fact_per_level=True)
rep = tree.print_for_gpt(pad_space=1, pad_char='> ')
print(rep)
def action_ex():
root_structure = [
LogicNode('Take an action',
operator=LogicNodeOperatorType.OR,
prunable=False,
can_be_leaf=False,
frozen=True)
]
root_structure[0].children = [
LogicNode('Run away',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=False,
frozen=True),
LogicNode('Fight back',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=False,
frozen=True),
LogicNode('Hide',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=False,
frozen=True),
]
for cidx, c in enumerate(root_structure[0].children):
nfacts = random.randint(2, 4)
for n in range(nfacts):
fact = LogicNode('',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=False,
frozen=True)
fact.children = [
LogicNode('Pro (supporting the parent action)',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False,
frozen=False),
LogicNode('Con (counters the sibling Pro only)',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False,
frozen=False)
]
root_structure[0].children[cidx].children.append(fact)
tree = LogicTree(depth=4,
root_structure=root_structure,
bf_factor={
1: 0.25,
2: 0.5,
3: 0.25
},
chance_of_or=0.0,
chance_of_cs_fact=0.0,
chance_to_prune_all=0.5,
chance_to_prune=0.75,
enforce_cs_fact_per_level=True)
rep = tree.print_for_gpt(pad_space=1, pad_char='- ')
print(rep)
tv_scene_ex()
eb_ex()
action_ex()
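    def json_roundtrip_ex():
        # Minimal sketch: hand-build a tiny tree, round-trip one node through
        # its JSON form, and list the leaf facts (fact values are placeholders).
        root = LogicNode('root',
                         operator=LogicNodeOperatorType.AND,
                         prunable=False,
                         can_be_leaf=False,
                         frozen=True)
        root.children = [
            LogicNode('An explicit fact from the story.',
                      fact_type=LogicNodeFactType.EXPLICIT,
                      can_be_leaf=True),
            LogicNode('A commonsense fact left unsaid.',
                      fact_type=LogicNodeFactType.COMMONSENSE,
                      can_be_leaf=True),
        ]
        restored = LogicNode.from_json(root.to_json())
        tree = LogicTree(nodes=[restored], populate=False, prune=False)
        print([f.value for f in tree.get_facts(include_cs=True)])

    json_roundtrip_ex()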

View File

@ -2,6 +2,7 @@ import concurrent
import concurrent.futures
import os
import socket
import time
import traceback
from typing import Dict, List, Optional, Union
@ -20,6 +21,8 @@ from .base_api import BaseAPIModel
PromptType = Union[PromptList, str]
BAILING_RETRY_DELAY: int = 30
class HTTPAdapterWithSocketOptions(HTTPAdapter):
@ -200,6 +203,9 @@ class BailingAPI(BaseAPIModel):
break # success
elif response.status_code == 426:
retry_num += 1 # retry
elif response.status_code in [429, 500, 504]:
time.sleep(BAILING_RETRY_DELAY)
retry_num += 1 # retry
else:
raise ValueError(f'Status code = {response.status_code}')
else:

View File

@ -526,7 +526,7 @@ class OpenAISDK(OpenAI):
def _generate(self, input: PromptList | str, max_out_len: int,
temperature: float) -> str:
from openai import BadRequestError
from openai import APIStatusError, BadRequestError
assert isinstance(input, (str, PromptList))
# max num token for gpt-3.5-turbo is 4097
@ -616,7 +616,7 @@ class OpenAISDK(OpenAI):
from the API provider.')
return responses.choices[0].message.content
except BadRequestError as e:
except (BadRequestError, APIStatusError) as e:
# Handle BadRequest status
# You can specify self.status_code_mappings to bypass \
# API sensitivity blocks
@ -625,12 +625,10 @@ class OpenAISDK(OpenAI):
status_code = e.status_code
if (status_code is not None
and status_code in self.status_code_mappings):
original_error_message = e.body.get('message')
error_message = self.status_code_mappings[status_code]
self.logger.info(
f'Status Code: {status_code}, '
f'Original Error Message: {original_error_message},'
f'Return Message: {error_message} ')
self.logger.info(f'Status Code: {status_code},\n'
f'Original Error Message: {e},\n'
f'Return Message: {error_message} ')
return error_message
else:
self.logger.error(e)

View File

@ -87,6 +87,7 @@ class TurboMindModelwithChatTemplate(BaseModel):
def generate(self,
inputs: List[str],
max_out_len: int,
min_out_len: Optional[int] = None,
stopping_criteria: List[str] = [],
do_sample: Optional[bool] = None,
temperature: float = 1.0,
@ -123,7 +124,11 @@ class TurboMindModelwithChatTemplate(BaseModel):
gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG)
gen_config.update(self.gen_config)
if do_sample or self.gen_config['do_sample']:
if max_out_len is not None:
gen_config['max_new_tokens'] = max_out_len
if min_out_len is not None:
gen_config['min_new_tokens'] = min_out_len
if do_sample or ('do_sample' in self.gen_config and self.gen_config['do_sample']):
gen_config['top_k'] = 40
gen_config['temperature'] = temperature
else:

View File

@ -101,7 +101,11 @@ class VLLM(BaseModel):
if not self.lora_path:
outputs = self.model.generate(inputs, sampling_kwargs)
else:
outputs = self.model.generate(inputs, sampling_kwargs, lora_request=LoRARequest("sql_adapter", 1, self.lora_path))
outputs = self.model.generate(inputs,
sampling_kwargs,
lora_request=LoRARequest(
'sql_adapter', 1,
self.lora_path))
prompt_list, output_strs = [], []
for output in outputs:

View File

@ -249,7 +249,7 @@ class VOLCRunner(BaseRunner):
with open(config_path) as fp:
volc_cfg = yaml.safe_load(fp)
if num_gpus <= 0:
flavor = 'ml.c1ie.2xlarge'
flavor = 'ml.c3i.2xlarge'
elif num_gpus == 1:
flavor = 'ml.pni2l.3xlarge'
elif num_gpus == 2:

View File

@ -167,7 +167,7 @@ DATASETS_MAPPING = {
"opencompass/math": {
"ms_id": "opencompass/math",
"hf_id": "opencompass/math",
"local": "./data/math/math.json",
"local": "./data/math/",
},
# MMLU
"opencompass/mmlu": {
@ -326,10 +326,24 @@ DATASETS_MAPPING = {
"ms_id": "",
"hf_id": "",
"local": "./data/mmmlu_lite",
}
},
"opencompass/musr": {
"ms_id": "",
"hf_id": "",
"local": "./data/musr",
},
"opencompass/babilong": {
"ms_id": "",
"hf_id": "",
"local": "./data/babilong/data/",
},
}
DATASETS_URL = {
"/musr": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/musr.zip",
"md5": "7447d2a5bec4586035196102135e2af9",
},
"/mmlu/": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu.zip",
"md5": "761310671509a239e41c4b717f7fab9c",
@ -360,7 +374,7 @@ DATASETS_URL = {
},
"/math/": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip",
"md5": "8b1b897259684672055e6fd4fc07c808",
"md5": "cb5b4c8378085929e20345174e731fdf",
},
"/hellaswag/": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/hellaswag.zip",
@ -426,6 +440,10 @@ DATASETS_URL = {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/fofo.zip",
"md5": "8a302712e425e27e4292a9369df5b9d3",
},
"subjective/followbench": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/followbench.zip",
"md5": "da7a831817c969da15d1e78d4a245d8a",
},
"subjective/mtbench101": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench101.zip",
"md5": "5d80257bc9929ebe5cfbf6d11184b04c",
@ -496,11 +514,11 @@ DATASETS_URL = {
},
"/aime": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime.zip",
"md5": "fbe2d0577fc210962a549f8cea1a00c8"
"md5": "fbe2d0577fc210962a549f8cea1a00c8",
},
"/cmo": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip",
"md5": "fad52c81290506a8ca74f46b5400d8fc"
"md5": "fad52c81290506a8ca74f46b5400d8fc",
},
"/nq-open": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nq-open.zip",
@ -521,5 +539,9 @@ DATASETS_URL = {
"/WikiBench": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/WikiBench.zip",
"md5": "6dac1d1a3133fe1effff185cbf71d928",
}
},
"/babilong": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/babilong.zip",
"md5": "e400864c31bc58d29eaa3e199751f99b",
},
}

View File

@ -128,7 +128,10 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
text = text.strip()
match = re.search(pattern, text, re.DOTALL)
if match:
outputs = match.group(0)
if match.group(1) is not None and match.group(1) != '':
outputs = match.group(1)
else:
outputs = match.group(0)
for i in options:
if i in outputs:
return i

View File

@ -9,6 +9,7 @@ fuzzywuzzy
h5py
huggingface_hub<=0.24.7
immutabledict
importlib-metadata
jieba
json5
mmengine-lite