[Dataset] Add Smolinstruct configs (#2127)

* 0-shot Smolinstruct: Add 0-shot evaluation and postprocess functions for Smolinstruct
* fix acc postprocessor
* update 0-shot acc postprocessor
* rename 0-shot
[Dataset] Add SuperGPQA subfield configs (#2124)
2025-05-30 16:03:24 +08:00 · 2025-05-29 14:09:08 +08:00 · 2025-05-28 14:12:58 +08:00 · 2025-05-27 19:41:13 +08:00 · 2025-05-22 16:47:57 +08:00 · 2025-05-20 16:46:55 +08:00
3065 changed files with 170801 additions and 18786 deletions
--- a/.codespellrc
+++ b/.codespellrc
@@ -2,4 +2,4 @@
 skip = *.ipynb
 count =
 quiet-level = 3
-ignore-words-list = nd, ans, ques, rouge, softwares
+ignore-words-list = nd, ans, ques, rouge, softwares, wit
--- a/.github/scripts/eval_regression_api.py
+++ b/.github/scripts/eval_regression_api.py
@@ -0,0 +1,42 @@
 from mmengine.config import read_base
 from opencompass.models.openai_api import OpenAISDK
 with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
        gsm8k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.race.race_gen import \
        race_datasets  # noqa: F401, E501
 # Collect every variable in this module's namespace whose name ends with
 # '_datasets' and concatenate those lists into one flat list.
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
 # Role layout for API-style models: HUMAN and BOT map onto identically named
 # API roles, the BOT turn is flagged with generate=True, and SYSTEM is listed
 # as a reserved role.
 api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
 )
 # Single OpenAISDK model entry talking to http://localhost:23333/v1 with a
 # placeholder key. NOTE(review): the abbr suggests this is a locally served
 # lmdeploy endpoint started by the CI job - confirm.
 models = [
    dict(
        abbr='lmdeploy-api-test',
        type=OpenAISDK,
        key='EMPTY',
        openai_api_base='http://localhost:23333/v1',
        path='internlm3',
        tokenizer_path='internlm/internlm3-8b-instruct',
        rpm_verbose=True,
        meta_template=api_meta_template,
        query_per_second=128,
        max_out_len=1024,
        max_seq_len=4096,
        temperature=0.01,
        batch_size=128,
        retry=20,
    )
 ]
 # Set the reader's test_range to '[0:16]' on every dataset.
 # NOTE(review): presumably this limits evaluation to the first 16 samples -
 # confirm against the dataset reader.
 for d in datasets:
    d['reader_cfg']['test_range'] = '[0:16]'
--- a/.github/scripts/eval_regression_base_fullbench.py
+++ b/.github/scripts/eval_regression_base_fullbench.py
@@ -0,0 +1,210 @@
 from mmengine.config import read_base
 with read_base():
    from opencompass.configs.datasets.ARC_c.ARC_c_few_shot_ppl import \
        ARC_c_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \
        bbh_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
        cmmlu_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.dingo.dingo_gen import \
        datasets as dingo_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.drop.drop_gen_a2697c import \
        drop_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
        GaokaoBench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import \
        gpqa_datasets  # noqa: F401, E501
    # Corebench v1.7
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
        gsm8k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
        hellaswag_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \
        humaneval_datasets as humaneval_v2_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \
        humaneval_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
        math_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
        mathbench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
        sanitized_mbpp_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \
        mmlu_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
        mmlu_pro_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import \
        nq_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.race.race_few_shot_ppl import \
        race_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \
        BoolQ_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
        TheoremQA_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \
        triviaqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \
        wikibench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
        winogrande_datasets  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
        models as hf_internlm2_5_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
        models as lmdeploy_internlm2_5_7b_model  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.bbh import \
        bbh_summary_groups  # noqa: F401, E501
    # Summary Groups
    from opencompass.configs.summarizers.groups.cmmlu import \
        cmmlu_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.GaokaoBench import \
        GaokaoBench_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
        mathbench_2024_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.mmlu import \
        mmlu_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.mmlu_pro import \
        mmlu_pro_summary_groups  # noqa: F401, E501
    from ...volc import infer as volc_infer  # noqa: F401, E501
 # Narrow each imported benchmark list to a subset before aggregation.
 race_datasets = [race_datasets[1]]  # Only take RACE-High
 # Give the second HumanEval variant its own abbr; the summarizer in this file
 # lists both 'openai_humaneval' and 'openai_humaneval_v2'.
 humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2'
 # Keep only the BBH entries whose abbr contains one of these two task names.
 bbh_datasets = [
    x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr']
    or 'multistep_arithmetic_two' in x['abbr']
 ]
 # Keep only the CMMLU entries whose abbr, with every 'cmmlu-' occurrence
 # removed, is one of the listed subject names.
 cmmlu_datasets = [
    x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [
        'ancient_chinese', 'chinese_civil_service_exam',
        'chinese_driving_rule', 'chinese_food_culture',
        'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
        'chinese_teacher_qualification', 'construction_project_management',
        'elementary_chinese', 'elementary_commonsense', 'ethnology',
        'high_school_politics', 'modern_chinese',
        'traditional_chinese_medicine'
    ]
 ]
 # Same filtering for MMLU, matching on the abbr with 'lukaemon_mmlu_' removed.
 mmlu_datasets = [
    x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
        'business_ethics', 'clinical_knowledge', 'college_medicine',
        'global_facts', 'human_aging', 'management', 'marketing',
        'medical_genetics', 'miscellaneous', 'nutrition',
        'professional_accounting', 'professional_medicine', 'virology'
    ]
 ]
 # Only the first MMLU-Pro entry is kept.
 mmlu_pro_datasets = [mmlu_pro_datasets[0]]
 # Only MathBench entries with 'college' in their abbr are kept.
 mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
 # Only the two GaokaoBench Math II subsets named below are kept.
 GaokaoBench_datasets = [
    x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
    or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr']
 ]
 # Concatenate every '*_datasets' list in the module namespace (after the
 # filtering above) into a single list.
 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
 # Concatenate every imported '*_summary_groups' list the same way.
 summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
 # Add a 'Mathbench' group whose subsets are the two MathBench averages.
 summary_groups.append(
    {
        'name': 'Mathbench',
        'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
    }, )
 # Report layout. Each entry of dataset_abbrs is either a bare abbr string or
 # an [abbr, metric] pair. NOTE(review): the plain words ('Language', 'Code',
 # ...), the '' entries and the '###### ... ######' strings are presumably
 # section headings / spacer rows in the rendered table - confirm against the
 # summarizer implementation.
 summarizer = dict(
    dataset_abbrs=[
        'Language',
        ['race-high', 'accuracy'],
        ['ARC-c', 'accuracy'],
        ['BoolQ', 'accuracy'],
        ['triviaqa_wiki_1shot', 'score'],
        ['nq_open_1shot', 'score'],
        '',
        'General Reasoning',
        ['drop', 'accuracy'],
        ['bbh', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        ['hellaswag', 'accuracy'],
        ['TheoremQA', 'score'],
        ['winogrande', 'accuracy'],
        '',
        'Math Calculation',
        ['gsm8k', 'accuracy'],
        ['GaokaoBench', 'weighted_average'],
        'GaokaoBench_2010-2022_Math_II_MCQs',
        'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
        ['math', 'accuracy'],
        ['Mathbench', 'naive_average'],
        '',
        'Knowledge',
        ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
        ['cmmlu', 'naive_average'],
        ['mmlu', 'naive_average'],
        ['mmlu_pro', 'naive_average'],
        '',
        'Code',
        ['openai_humaneval', 'humaneval_pass@1'],
        ['openai_humaneval_v2', 'humaneval_pass@1'],
        ['sanitized_mbpp', 'score'],
        '',
        ['dingo_en_192', 'score'],
        ['dingo_zh_170', 'score'],
        '',
        'mmlu',
        'mmlu-stem',
        'mmlu-social-science',
        'mmlu-humanities',
        ['mmlu-other', 'accuracy'],
        '',
        'cmmlu',
        'cmmlu-stem',
        'cmmlu-social-science',
        'cmmlu-humanities',
        'cmmlu-other',
        ['cmmlu-china-specific', 'accuracy'],
        '',
        'mmlu_pro',
        'mmlu_pro_biology',
        'mmlu_pro_business',
        'mmlu_pro_chemistry',
        'mmlu_pro_computer_science',
        'mmlu_pro_economics',
        'mmlu_pro_engineering',
        'mmlu_pro_health',
        'mmlu_pro_history',
        'mmlu_pro_law',
        'mmlu_pro_math',
        'mmlu_pro_philosophy',
        'mmlu_pro_physics',
        'mmlu_pro_psychology',
        'mmlu_pro_other',
        '',
        'bbh-logical_deduction_seven_objects',
        'bbh-multistep_arithmetic_two',
        '###### MathBench-A: Application Part ######',
        'college',
        'high',
        'middle',
        'primary',
        'arithmetic',
        'mathbench-a (average)',
        '###### MathBench-T: Theory Part ######',
        'college_knowledge',
        'high_knowledge',
        'middle_knowledge',
        'primary_knowledge',
        'mathbench-t (average)',
    ],
    # Group definitions assembled earlier in this file (imported groups plus
    # the appended 'Mathbench' group).
    summary_groups=summary_groups,
 )
 # Concatenate every '*_model' list in the module namespace into one list.
 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
 # NOTE(review): this repeats the '*_datasets' aggregation performed earlier
 # in this file; no '*_datasets' variable is reassigned in between, so it looks
 # redundant - confirm before removing.
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
 # Set the reader's test_range to '[0:16]' on every dataset.
 # NOTE(review): presumably this limits evaluation to the first 16 samples -
 # confirm against the dataset reader.
 for d in datasets:
    d['reader_cfg']['test_range'] = '[0:16]'
 for m in models:
    # Tag each model abbr with a '_fullbench' suffix.
    m['abbr'] = m['abbr'] + '_fullbench'
    # For abbrs containing 'turbomind' or 'lmdeploy', force both batch-size
    # settings to 1. Assumes such configs define 'engine_config' - otherwise
    # this raises KeyError.
    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1
 # Order models by ascending run_cfg['num_gpus'].
 models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
--- a/.github/scripts/eval_regression_base_models.py
+++ b/.github/scripts/eval_regression_base_models.py
@@ -0,0 +1,129 @@
 from mmengine.config import read_base
 with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
        gpqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
        gsm8k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.race.race_ppl import \
        race_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
        winogrande_datasets  # noqa: F401, E501
    # read hf models - chat models
    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
        models as lmdeploy_glm4_9b_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
        models as hf_deepseek_7b_base_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
        models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
        models as lmdeploy_deepseek_67b_base_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2 import \
        lmdeploy_deepseek_v2_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
        models as vllm_deepseek_moe_16b_base_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_2b import \
        models as hf_gemma2_2b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_9b import \
        models as hf_gemma2_9b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_2b import \
        models as hf_gemma_2b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_7b import \
        models as hf_gemma_7b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.lmdeploy_gemma_9b import \
        models as lmdeploy_gemma_9b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.vllm_gemma_2b import \
        models as vllm_gemma_2b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.vllm_gemma_7b import \
        models as vllm_gemma_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
        models as hf_internlm2_5_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
        models as hf_internlm2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
        models as lmdeploy_internlm2_1_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
        models as lmdeploy_internlm2_5_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_7b import \
        models as lmdeploy_internlm2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_20b import \
        models as lmdeploy_internlm2_20b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
        models as lmdeploy_internlm2_base_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \
        models as lmdeploy_internlm2_base_20b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama2_7b import \
        models as hf_llama2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
        models as hf_llama3_1_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_8b import \
        models as hf_llama3_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
        models as lmdeploy_llama3_1_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
        models as lmdeploy_llama3_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import \
        models as lmdeploy_llama3_70b_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
        models as hf_mistral_7b_v0_3_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.hf_qwen_2_5_7b import \
        models as hf_qwen_2_5_7b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \
        models as hf_qwen_2_5_14b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.hf_qwen_2_5_32b import \
        models as hf_qwen_2_5_32b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
        models as lmdeploy_qwen2_5_1_5b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \
        models as lmdeploy_qwen2_5_7b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b import \
        models as lmdeploy_qwen2_5_32b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b import \
        models as lmdeploy_qwen2_5_72b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
        models as hf_qwen1_5_moe_a2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
        models as hf_qwen2_0_5b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_1_5b import \
        models as hf_qwen2_1_5b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_7b import \
        models as hf_qwen2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \
        models as lmdeploy_qwen2_1_5b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \
        models as lmdeploy_qwen2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.vllm_qwen1_5_0_5b import \
        models as vllm_qwen1_5_0_5b_model  # noqa: F401, E501
    from opencompass.configs.models.yi.hf_yi_1_5_6b import \
        models as hf_yi_1_5_6b_model  # noqa: F401, E501
    from opencompass.configs.models.yi.hf_yi_1_5_9b import \
        models as hf_yi_1_5_9b_model  # noqa: F401, E501
    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b import \
        models as lmdeploy_yi_1_5_9b_model  # noqa: F401, E501
    from ...volc import infer as volc_infer  # noqa: F401, E501
 # Keep only the second RACE entry (index 1).
 # NOTE(review): presumably RACE-High, matching the 'race-high' row in the
 # summarizer below - confirm.
 race_datasets = [race_datasets[1]]
 # Concatenate every '*_model' and every '*_datasets' list found in the module
 # namespace into flat lists.
 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
 # Set the reader's test_range to '[0:32]' on every dataset.
 # NOTE(review): presumably this limits evaluation to the first 32 samples -
 # confirm against the dataset reader.
 for d in datasets:
    d['reader_cfg']['test_range'] = '[0:32]'
 # For abbrs containing 'turbomind' or 'lmdeploy', force both batch-size
 # settings to 1. Assumes such configs define 'engine_config' - otherwise this
 # raises KeyError.
 for m in models:
    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1
 # Order models by ascending run_cfg['num_gpus'].
 models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
 # Report one [abbr, metric] row per benchmark. summary_groups is built from
 # any '*_summary_groups' variables in the namespace; none are imported in the
 # visible part of this file, so this evaluates to [] unless one is defined
 # elsewhere.
 summarizer = dict(
    dataset_abbrs=[
        ['gsm8k', 'accuracy'],
        ['GPQA_diamond', 'accuracy'],
        ['race-high', 'accuracy'],
        ['winogrande', 'accuracy'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )
--- a/.github/scripts/eval_regression_chat_models.py
+++ b/.github/scripts/eval_regression_chat_models.py
@@ -0,0 +1,193 @@
 from mmengine.config import read_base
 with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
        gsm8k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.race.race_gen import \
        race_datasets  # noqa: F401, E501
    # read hf models - chat models
    from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
        models as hf_glm4_9b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
        models as lmdeploy_glm4_9b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.chatglm.vllm_glm4_9b_chat import \
        models as vllm_glm4_9b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
        models as hf_deepseek_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \
        models as lmdeploy_deepseek_67b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \
        models as \
        lmdeploy_deepseek_r1_distill_llama_8b_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \
        models as \
        lmdeploy_deepseek_r1_distill_llama_70b_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \
        models as \
        lmdeploy_deepseek_r1_distill_qwen_1_5b_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \
        models as \
        lmdeploy_deepseek_r1_distill_qwen_32b_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
        models as lmdeploy_deepseek_v2_5_1210_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \
        models as lmdeploy_deepseek_v2_lite_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
        models as vllm_deepseek_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
        models as hf_gemma2_2b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
        models as hf_gemma2_9b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_27b_it import \
        models as hf_gemma2_27b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_2b_it import \
        models as hf_gemma_2b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_7b_it import \
        models as hf_gemma_7b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
        models as lmdeploy_gemma_9b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \
        models as lmdeploy_gemma_27b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
        models as vllm_gemma_7b_it_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
        models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
        models as hf_internlm2_5_20b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
        models as hf_internlm3_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
        models as lmdeploy_internlm2_5_20b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
        models as lmdeploy_internlm2_chat_1_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
        models as lmdeploy_internlm2_chat_1_8b_sft_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
        models as lmdeploy_internlm2_chat_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
        models as lmdeploy_internlm2_chat_7b_sft_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
        models as lmdeploy_internlm3_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
        models as vllm_internlm2_chat_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
        models as hf_llama3_1_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_2_3b_instruct import \
        models as hf_llama3_2_3b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
        models as hf_llama3_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \
        models as lmdeploy_llama2_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
        models as lmdeploy_llama3_1_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
        models as lmdeploy_llama3_2_3b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_3_70b_instruct import \
        models as lmdeploy_llama3_3_70b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
        models as hf_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \
        models as hf_mistral_7b_instruct_v0_3_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_nemo_instruct_2407 import \
        models as hf_mistral_nemo_instruct_2407_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
        models as hf_mistral_small_instruct_2409_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
        models as \
        lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \
        models as lmdeploy_mistral_nemo_instruct_2407_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
        models as \
        lmdeploy_mistral_small_instruct_2409_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \
        models as \
        lmdeploy_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
        models as vllm_mistral_7b_instruct_v0_1_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
        models as vllm_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \
        models as vllm_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
    from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
        models as lmdeploy_nemotron_70b_instruct_hf_model  # noqa: F401, E501
    from opencompass.configs.models.phi.hf_phi_4 import \
        models as hf_phi_4_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
        models as hf_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \
        models as hf_qwen2_5_3b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \
        models as hf_qwen2_5_14b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import \
        models as lmdeploy_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import \
        models as lmdeploy_qwen2_5_3b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
        models as lmdeploy_qwen2_5_14b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
        models as lmdeploy_qwen2_5_72b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
        models as hf_qwen1_5_0_5b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
        models as hf_qwen2_1_5b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import \
        models as hf_qwen2_7b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
        models as lmdeploy_qwen2_1_5b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
        models as lmdeploy_qwen2_7b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.vllm_qwen1_5_0_5b_chat import \
        models as vllm_qwen1_5_0_5b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import \
        models as hf_yi_1_5_6b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
        models as hf_yi_1_5_9b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.yi.lmdeploy_yi_1_5_6b_chat import \
        models as lmdeploy_yi_1_5_6b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \
        models as lmdeploy_yi_1_5_9b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.yi.lmdeploy_yi_1_5_34b_chat import \
        models as lmdeploy_yi_1_5_34b_chat_model  # noqa: F401, E501
    from ...volc import infer as volc_infer  # noqa: F401, E501
 # Override the model path of the imported HF GLM-4 chat config.
 hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'
 # Keep only the second RACE entry (index 1).
 race_datasets = [race_datasets[1]]
 # Concatenate every '*_datasets' list in the module namespace into one list.
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
 # Role layout for API-style models: HUMAN and BOT map onto identically named
 # API roles, the BOT turn is flagged with generate=True, and SYSTEM is listed
 # as a reserved role.
 # NOTE(review): no other statement in the visible part of this file reads
 # api_meta_template - confirm whether it is still needed.
 api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
 )
 # Set the reader's test_range to '[0:32]' on every dataset.
 # NOTE(review): presumably this limits evaluation to the first 32 samples -
 # confirm against the dataset reader.
 for d in datasets:
    d['reader_cfg']['test_range'] = '[0:32]'
 # Concatenate every '*_model' list in the module namespace into one list.
 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
 # For abbrs containing 'turbomind' or 'lmdeploy', force both batch-size
 # settings to 1. Assumes such configs define 'engine_config' - otherwise this
 # raises KeyError.
 for m in models:
    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1
 # Order models by ascending run_cfg['num_gpus'].
 models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
 # NOTE(review): 'race-middle' is listed here although only race_datasets[1]
 # is kept above; if that entry is RACE-High, the 'race-middle' row will have
 # no results - confirm.
 summarizer = dict(
    dataset_abbrs=[
        'gsm8k',
        'race-middle',
        'race-high',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )
--- a/.github/scripts/eval_regression_chat_obj_fullbench.py
+++ b/.github/scripts/eval_regression_chat_obj_fullbench.py
@ -0,0 +1,317 @@
 from mmengine.config import read_base
 with read_base():
    # read hf models - chat models
    # Dataset
    from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import \
        aime2024_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
        ARC_c_datasets  # noqa: F401, E501
    # remove because of oom
    # from opencompass.configs.datasets.ARC_Prize_Public_Evaluation.arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
        bbh_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_complete_gen_faf748 import \
        bigcodebench_hard_complete_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_8815eb import \
        bigcodebench_hard_instruct_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
        cmmlu_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
        cmo_fib_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
        drop_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \
        ds1000_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
        GaokaoBench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
        gpqa_datasets  # noqa: F401, E501
    # new datasets in Fullbench v1.1
    from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_6e39a4 import \
        gsm8k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
        hellaswag_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
        humaneval_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.humanevalx.humanevalx_gen_3d84a3 import \
        humanevalx_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
        ifeval_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import \
        korbench_0shot_single_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
        LCB_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.math.math_0shot_gen_11c4b5 import \
        math_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
        mathbench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
        sanitized_mbpp_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
        mmlu_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
        mmlu_pro_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \
        mmmlu_lite_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.musr.musr_gen_3622bb import \
        musr_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
        nq_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.race.race_cot_gen_d95929 import \
        race_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.scicode.scicode_gen_085b98 import \
        SciCode_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \
        BoolQ_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
        teval_datasets as teval_en_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
        teval_datasets as teval_zh_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
        TheoremQA_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \
        triviaqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import \
        wikibench_datasets  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
        models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
    # Summary Groups
    from opencompass.configs.summarizers.groups.bbh import \
        bbh_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.cmmlu import \
        cmmlu_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.ds1000 import \
        ds1000_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.GaokaoBench import \
        GaokaoBench_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.humanevalx import \
        humanevalx_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.korbench import \
        korbench_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
        mathbench_2024_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.mmlu import \
        mmlu_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.mmlu_pro import \
        mmlu_pro_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.musr_average import \
        summarizer as musr_summarizer  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.scicode import \
        scicode_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.teval import \
        teval_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.mmmlu_lite import \
        mmmlu_summary_groups  # noqa: F401, E501
    from ...volc import infer as volc_infer  # noqa: F401, E501
 # Keep only the RACE config at index 1 (presumably race-high, which is the
 # RACE entry listed in the summarizer; confirm against race_datasets).
 race_datasets = [race_datasets[1]]
 # For HumanEval-X Evaluation
 # Set the evaluator address to the codeeval.opencompass.org.cn endpoint and
 # leave the port empty.
 for item in humanevalx_datasets:
    item['eval_cfg']['evaluator'][
        'ip_address'] = 'codeeval.opencompass.org.cn/humanevalx'
    item['eval_cfg']['evaluator']['port'] = ''
 # For DS-1000 Evaluation
 # Set the evaluator address to the codeeval.opencompass.org.cn endpoint and
 # leave the port empty.
 for item in ds1000_datasets:
    item['eval_cfg']['evaluator'][
        'ip_address'] = 'codeeval.opencompass.org.cn/ds1000'
    item['eval_cfg']['evaluator']['port'] = ''
 # BBH: keep only the configs whose abbr mentions one of two tasks.
 bbh_datasets = [
    x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr']
    or 'multistep_arithmetic_two' in x['abbr']
 ]
 # CMMLU: keep only the listed subjects (abbr with 'cmmlu-' removed).
 cmmlu_datasets = [
    x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [
        'ancient_chinese', 'chinese_civil_service_exam',
        'chinese_driving_rule', 'chinese_food_culture',
        'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
        'chinese_teacher_qualification', 'construction_project_management',
        'elementary_chinese', 'elementary_commonsense', 'ethnology',
        'high_school_politics', 'modern_chinese',
        'traditional_chinese_medicine'
    ]
 ]
 # MMLU: keep only the listed subjects (abbr with 'lukaemon_mmlu_' removed).
 mmlu_datasets = [
    x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
        'business_ethics', 'clinical_knowledge', 'college_medicine',
        'global_facts', 'human_aging', 'management', 'marketing',
        'medical_genetics', 'miscellaneous', 'nutrition',
        'professional_accounting', 'professional_medicine', 'virology'
    ]
 ]
 # MMLU-Pro: keep only the first config.
 mmlu_pro_datasets = [mmlu_pro_datasets[0]]
 # MMMLU-lite: keep only configs whose abbr contains 'mmlu_lite_AR-XY'.
 mmmlu_lite_datasets = [
    x for x in mmmlu_lite_datasets if 'mmlu_lite_AR-XY' in x['abbr']
 ]
 # MathBench: keep only configs whose abbr contains 'college'.
 mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
 # GaokaoBench: keep only the two Math II subsets named below.
 GaokaoBench_datasets = [
    x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
    or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr']
 ]
 # Flatten every module-level '*_datasets' list, skipping names that contain
 # 'scicode' (case-insensitive) or 'teval'; the two teval lists are appended
 # explicitly afterwards and SciCode stays disabled.
 datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')
     and 'scicode' not in k.lower() and 'teval' not in k),
    [],
 )
 datasets += teval_en_datasets
 datasets += teval_zh_datasets
 # datasets += SciCode_datasets
 # Re-bind the MuSR groups under a name ending in '_summary_groups' so the
 # locals() scan on the next statement picks them up.
 musr_summary_groups = musr_summarizer['summary_groups']
 # Flatten every module-level list whose name ends in '_summary_groups'.
 summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
 # Extra 'Mathbench' group combining the two MathBench average subsets.
 summary_groups.append(
    {
        'name': 'Mathbench',
        'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
    }, )
 # Summarizer
 # dataset_abbrs mixes three kinds of entries: plain strings (section
 # headings, '' separators, and dataset or group names) and
 # [abbreviation, metric] pairs that select one metric of a dataset or group.
 # NOTE(review): 'ARC_Prize_Public_Evaluation' and 'SciCode' are listed here
 # although their datasets are not added to `datasets` in this config.
 summarizer = dict(
    dataset_abbrs=[
        'Language',
        ['race-high', 'accuracy'],
        ['ARC-c', 'accuracy'],
        ['BoolQ', 'accuracy'],
        ['triviaqa_wiki_1shot', 'score'],
        ['nq_open_1shot', 'score'],
        ['mmmlu_lite', 'naive_average'],
        '',
        'Instruction Following',
        ['IFEval', 'Prompt-level-strict-accuracy'],
        '',
        'General Reasoning',
        ['drop', 'accuracy'],
        ['bbh', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        ['hellaswag', 'accuracy'],
        ['TheoremQA', 'score'],
        ['musr_average', 'naive_average'],
        ['korbench_single', 'naive_average'],
        ['ARC_Prize_Public_Evaluation', 'accuracy'],
        '',
        'Math Calculation',
        ['gsm8k', 'accuracy'],
        ['GaokaoBench', 'weighted_average'],
        ['math', 'accuracy'],
        ['cmo_fib', 'accuracy'],
        ['aime2024', 'accuracy'],
        ['Mathbench', 'naive_average'],
        '',
        'Knowledge',
        ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
        ['cmmlu', 'naive_average'],
        ['mmlu', 'naive_average'],
        ['mmlu_pro', 'naive_average'],
        '',
        'Code',
        ['openai_humaneval', 'humaneval_pass@1'],
        ['sanitized_mbpp', 'score'],
        ['humanevalx', 'naive_average'],
        ['ds1000', 'naive_average'],
        ['lcb_code_generation', 'pass@1'],
        ['lcb_code_execution', 'pass@1'],
        ['lcb_test_output', 'pass@1'],
        ['bigcodebench_hard_instruct', 'pass@1'],
        ['bigcodebench_hard_complete', 'pass@1'],
        '',
        'Agent',
        ['teval', 'naive_average'],
        ['SciCode', 'accuracy'],
        ['SciCode', 'sub_accuracy'],
        '',
        'bbh-logical_deduction_seven_objects',
        'bbh-multistep_arithmetic_two',
        '',
        'mmlu',
        'mmlu-stem',
        'mmlu-social-science',
        'mmlu-humanities',
        'mmlu-other',
        '',
        'cmmlu',
        'cmmlu-stem',
        'cmmlu-social-science',
        'cmmlu-humanities',
        'cmmlu-other',
        'cmmlu-china-specific',
        '',
        'mmlu_pro',
        'mmlu_pro_biology',
        'mmlu_pro_business',
        'mmlu_pro_chemistry',
        'mmlu_pro_computer_science',
        'mmlu_pro_economics',
        'mmlu_pro_engineering',
        'mmlu_pro_health',
        'mmlu_pro_history',
        'mmlu_pro_law',
        'mmlu_pro_math',
        'mmlu_pro_philosophy',
        'mmlu_pro_physics',
        'mmlu_pro_psychology',
        'mmlu_pro_other',
        '',
        'ds1000_Pandas',
        'ds1000_Numpy',
        'ds1000_Tensorflow',
        'ds1000_Scipy',
        'ds1000_Sklearn',
        'ds1000_Pytorch',
        'ds1000_Matplotlib',
        '',
        'mmmlu_lite',
        'openai_mmmlu_lite_AR-XY',
        'openai_mmmlu_lite_BN-BD',
        'openai_mmmlu_lite_DE-DE',
        'openai_mmmlu_lite_ES-LA',
        'openai_mmmlu_lite_FR-FR',
        'openai_mmmlu_lite_HI-IN',
        'openai_mmmlu_lite_ID-ID',
        'openai_mmmlu_lite_IT-IT',
        'openai_mmmlu_lite_JA-JP',
        'openai_mmmlu_lite_KO-KR',
        'openai_mmmlu_lite_PT-BR',
        'openai_mmmlu_lite_SW-KE',
        'openai_mmmlu_lite_YO-NG',
        'openai_mmmlu_lite_ZH-CN',
        '',
        '###### MathBench-A: Application Part ######',
        'college',
        'high',
        'middle',
        'primary',
        'arithmetic',
        'mathbench-a (average)',
        '###### MathBench-T: Theory Part ######',
        'college_knowledge',
        'high_knowledge',
        'middle_knowledge',
        'primary_knowledge',
        'mathbench-t (average)',
    ],
    summary_groups=summary_groups,
 )
 # Limit every dataset reader to the '[0:16]' test range.
 for d in datasets:
    d['reader_cfg']['test_range'] = '[0:16]'
 # Flatten every module-level model list (names ending in '_model').
 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
 for m in models:
    # Tag each model abbreviation with a '_fullbench' suffix.
    m['abbr'] = m['abbr'] + '_fullbench'
    # turbomind/lmdeploy entries are pinned to batch size 1, both in the
    # engine config and on the model entry itself.
    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1
 # Order the models by ascending 'num_gpus' from their run_cfg.
 models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
--- a/.github/scripts/eval_regression_chat_sub_fullbench.py
+++ b/.github/scripts/eval_regression_chat_sub_fullbench.py
@ -0,0 +1,182 @@
 from copy import deepcopy
 from mmengine.config import read_base
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.summarizers import DefaultSubjectiveSummarizer
 from opencompass.tasks.subjective_eval import SubjectiveEvalTask
 with read_base():
    # read hf models - chat models
    # Dataset
    from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import \
        csimpleqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.SimpleQA.simpleqa_gen_0283c3 import \
        simpleqa_datasets  # noqa: F401, E501; noqa: F401, E501
    from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm_new import \
        alignbench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_new import \
        alpacav2_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_new import \
        arenahard_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \
        compassarena_datasets  # noqa: F401, E501
    # from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import fofo_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \
        followbench_llmeval_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \
        mtbench101_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_new import \
        wildbench_datasets  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
        models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
    from ...volc import infer as volc_infer  # noqa: F401, E501
 # Flatten every module-level '*_datasets' list except the mtbench101 and
 # wildbench ones, then append those two explicitly so they come last.
 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
                 and 'mtbench101' not in k and 'wildbench' not in k), [])
 datasets += mtbench101_datasets  # noqa: F401, E501
 datasets += wildbench_datasets  # noqa: F401, E501
 # Role template for API-style models: HUMAN and BOT turns (BOT is the
 # generating role), with SYSTEM declared as a reserved role.
 api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
 )
 # Flatten every module-level model list (names ending in '_model').
 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
 for m in models:
    # Tag each model abbreviation with a '_fullbench' suffix.
    m['abbr'] = m['abbr'] + '_fullbench'
    # turbomind/lmdeploy entries are pinned to batch size 1, both in the
    # engine config and on the model entry itself.
    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1
 # Order the models by ascending 'num_gpus' from their run_cfg.
 models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
 # The judge is a deep copy of the second model after sorting, so renaming
 # it below does not touch the evaluated entry in `models`.
 judge_models = deepcopy([models[1]])
 judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'
 # Evaluation stage: the subjective partitioner receives the evaluated models
 # and the judge models, and a LocalRunner runs SubjectiveEvalTask tasks with
 # max_num_workers=16.
 eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=SubjectiveEvalTask)),
 )
 # Summary groups for the subjective report. Each compassarena_* group wraps
 # a single [dataset, metric] subset; 'CompassArena' then combines those five
 # groups.
 summary_groups = []
 summary_groups.append({
    'name': 'compassarena_language',
    'subsets': [
        ['compassarena_language', '内容总结'],
    ],
 })
 summary_groups.append({
    'name': 'compassarena_knowledge',
    'subsets': [
        ['compassarena_knowledge', '生活常识_ZH'],
    ],
 })
 summary_groups.append({
    'name': 'compassarena_reason_v2',
    'subsets': [
        ['compassarena_reason_v2', 'reasoning'],
    ],
 })
 summary_groups.append({
    'name': 'compassarena_math_v2',
    'subsets': [
        ['compassarena_math_v2', '高等数学_ZH'],
    ],
 })
 summary_groups.append({
    'name': 'compassarena_creationv2_zh',
    'subsets': [
        ['compassarena_creationv2_zh', '内容扩写_ZH'],
    ],
 })
 summary_groups.append({
    'name':
    'CompassArena',
    'subsets': [
        'compassarena_language',
        'compassarena_knowledge',
        'compassarena_reason_v2',
        'compassarena_math_v2',
        'compassarena_creationv2_zh',
    ],
 })
 # NOTE(review): the fofo dataset import in this config is commented out, so
 # this group presumably has nothing to aggregate; confirm.
 summary_groups.append({
    'name':
    'FoFo',
    'subsets': [['fofo_test_prompts', 'overall'],
                ['fofo_test_prompts_cn', 'overall']],
 })
 # 'Followbench' combines the HSR_AVG and SSR_AVG metrics of the English
 # followbench_llmeval dataset.
 summary_groups.append({
    'name':
    'Followbench',
    'subsets': [
        ['followbench_llmeval_en', 'HSR_AVG'],
        ['followbench_llmeval_en', 'SSR_AVG'],
    ],
 })
 # Summarizer
 # dataset_abbrs lists [abbreviation, metric] pairs: headline entries first,
 # then (after the '' separator) the per-category and per-level breakdowns.
 summarizer = dict(
    dataset_abbrs=[
        ['alignment_bench_v1_1', '总分'],
        ['alpaca_eval', 'total'],
        ['arenahard', 'score'],
        ['Followbench', 'naive_average'],
        ['CompassArena', 'naive_average'],
        ['FoFo', 'naive_average'],
        ['mtbench101', 'avg'],
        ['wildbench', 'average'],
        ['simpleqa', 'accuracy_given_attempted'],
        ['chinese_simpleqa', 'given_attempted_accuracy'],
        '',
        ['alignment_bench_v1_1', '专业能力'],
        ['alignment_bench_v1_1', '数学计算'],
        ['alignment_bench_v1_1', '基本任务'],
        ['alignment_bench_v1_1', '逻辑推理'],
        ['alignment_bench_v1_1', '中文理解'],
        ['alignment_bench_v1_1', '文本写作'],
        ['alignment_bench_v1_1', '角色扮演'],
        ['alignment_bench_v1_1', '综合问答'],
        ['alpaca_eval', 'helpful_base'],
        ['alpaca_eval', 'koala'],
        ['alpaca_eval', 'oasst'],
        ['alpaca_eval', 'selfinstruct'],
        ['alpaca_eval', 'vicuna'],
        ['compassarena_language', 'naive_average'],
        ['compassarena_knowledge', 'naive_average'],
        ['compassarena_reason_v2', 'naive_average'],
        ['compassarena_math_v2', 'naive_average'],
        ['compassarena_creationv2_zh', 'naive_average'],
        ['fofo_test_prompts', 'overall'],
        ['fofo_test_prompts_cn', 'overall'],
        ['followbench_llmeval_en', 'HSR_AVG'],
        ['followbench_llmeval_en', 'SSR_AVG'],
        ['followbench_llmeval_en', 'HSR_L1'],
        ['followbench_llmeval_en', 'HSR_L2'],
        ['followbench_llmeval_en', 'HSR_L3'],
        ['followbench_llmeval_en', 'HSR_L4'],
        ['followbench_llmeval_en', 'HSR_L5'],
        ['followbench_llmeval_en', 'SSR_L1'],
        ['followbench_llmeval_en', 'SSR_L2'],
        ['followbench_llmeval_en', 'SSR_L3'],
        ['followbench_llmeval_en', 'SSR_L4'],
        ['followbench_llmeval_en', 'SSR_L5'],
        ['simpleqa', 'f1'],
    ],
    type=DefaultSubjectiveSummarizer,
    summary_groups=summary_groups,
 )
--- a/.github/scripts/oc_score_assert.py
+++ b/.github/scripts/oc_score_assert.py
@ -0,0 +1,383 @@
 import csv
 import os
 import pytest
 import yaml
 output_path = 'regression_result_daily'
 def model_list(type):
    """Return the model names listed in one section of the testrange baseline.

    The path is relative, so this must be called with the repository root as
    the working directory.

    Args:
        type: Top-level key of ``oc_score_baseline_testrange.yaml`` to read
            (the tests in this file pass ``'chat'`` and ``'base'``).

    Returns:
        A keys view over the models configured under that key.
    """
    config_path = '.github/scripts/oc_score_baseline_testrange.yaml'
    # Decode as UTF-8 explicitly so parsing does not depend on the locale of
    # the machine running the tests.
    with open(config_path, encoding='utf-8') as f:
        config = yaml.safe_load(f)
    return config.get(type).keys()
 def dataset_list(model, type):
    """Return the metric names recorded for a model in the fullbench baseline.

    The path is relative, so this must be called with the repository root as
    the working directory.

    Args:
        model: Top-level key of ``oc_score_baseline_fullbench.yaml``.
        type: Section under that model (the tests in this file pass
            ``'objective'``, ``'subjective'`` and ``'long_context'``).

    Returns:
        A keys view over the metrics configured in that section.
    """
    config_path = '.github/scripts/oc_score_baseline_fullbench.yaml'
    # Decode as UTF-8 explicitly: metric names may be non-ASCII, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(config_path, encoding='utf-8') as f:
        config = yaml.safe_load(f)
    return config.get(model).get(type).keys()
@pytest.fixture()
 def baseline_scores_testrange(request):
    config_path = os.path.join(
        request.config.rootdir,
        '.github/scripts/oc_score_baseline_testrange.yaml')
    with open(config_path) as f:
        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
    return config
@pytest.fixture()
 def baseline_scores(request):
    config_path = os.path.join(request.config.rootdir,
                               '.github/scripts/oc_score_baseline.yaml')
    with open(config_path) as f:
        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
    return config
@pytest.fixture()
 def baseline_scores_fullbench(request):
    config_path = os.path.join(
        request.config.rootdir,
        '.github/scripts/oc_score_baseline_fullbench.yaml')
    with open(config_path) as f:
        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
    return config
@pytest.fixture()
 def result_scores():
    file = find_csv_files(output_path)
    if file is None:
        return None
    return read_csv_file(file)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_testrange')
@pytest.mark.chat_models
 class TestChat:
    """Test cases for chat model."""
    @pytest.mark.parametrize(
        'model, dataset', [(p1, p2) for p1 in model_list('chat')
                           for p2 in ['gsm8k_accuracy', 'race-high_accuracy']])
    def test_model_dataset_score(self, baseline_scores_testrange,
                                 result_scores, model, dataset):
        base_score = baseline_scores_testrange.get('chat').get(model).get(
            dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model, result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_testrange')
@pytest.mark.base_models
 class TestBase:
    """Test cases for base model."""
    @pytest.mark.parametrize('model, dataset',
                             [(p1, p2) for p1 in model_list('base') for p2 in [
                                 'gsm8k_accuracy', 'GPQA_diamond_accuracy',
                                 'race-high_accuracy', 'winogrande_accuracy'
                             ]])
    def test_model_dataset_score(self, baseline_scores_testrange,
                                 result_scores, model, dataset):
        if model in ['gemma-2b-vllm', 'gemma-7b-vllm'
                     ] and dataset != 'gsm8k_accuracy':
            return
        base_score = baseline_scores_testrange.get('base').get(model).get(
            dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model, result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_obj_fullbench
 class TestChatObjFullbench:
    """Test cases for chat model."""
    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
        'internlm2_5-7b-chat-hf_fullbench',
        'internlm2_5-7b-chat-turbomind_fullbench'
    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'objective')])
    def test_model_dataset_score(self, baseline_scores_fullbench,
                                 result_scores, model, dataset):
        base_score = baseline_scores_fullbench.get(model).get('objective').get(
            dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model, result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_sub_fullbench
 class TestChatSubFullbench:
    """Test cases for chat model."""
    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
        'internlm2_5-7b-chat-hf_fullbench',
        'internlm2_5-7b-chat-turbomind_fullbench'
    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'subjective')]
                             )
    def test_model_dataset_score(self, baseline_scores_fullbench,
                                 result_scores, model, dataset):
        base_score = baseline_scores_fullbench.get(model).get(
            'subjective').get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model, result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.base_fullbench
 class TestBaseFullbench:
    """Test cases for chat model."""
    @pytest.mark.parametrize(
        'model, dataset',
        [(p1, p2) for p1 in
         ['internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench']
         for p2 in dataset_list('internlm2_5-7b-hf_fullbench', 'objective')])
    def test_model_dataset_score(self, baseline_scores_fullbench,
                                 result_scores, model, dataset):
        base_score = baseline_scores_fullbench.get(model).get('objective').get(
            dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model, result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.api
 class TestApibench:
    """Test cases for chat model."""
    @pytest.mark.parametrize('model, dataset',
                             [('lmdeploy-api-test', 'race-middle_accuracy'),
                              ('lmdeploy-api-test', 'race-high_accuracy'),
                              ('lmdeploy-api-test', 'gsm8k_accuracy')])
    def test_api(self, baseline_scores, result_scores, model, dataset):
        base_score = baseline_scores.get(model).get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.volc_fullbench
 class TestVolcFullbench:
    """Test cases for chat model."""
    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
        'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
        'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
        'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
    ] for p2 in dataset_list(p1, 'objective')])
    @pytest.mark.chat_objective
    def test_chat_objective(self, baseline_scores_fullbench, result_scores,
                            model, dataset):
        base_score = baseline_scores_fullbench.get(model).get('objective').get(
            dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)
    @pytest.mark.parametrize('model, dataset', [
        (p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
        for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'subjective')
    ])
    @pytest.mark.chat_subjective
    def test_chat_subjective(self, baseline_scores_fullbench, result_scores,
                             model, dataset):
        base_score = baseline_scores_fullbench.get(model).get(
            'subjective').get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)
    @pytest.mark.parametrize(
        'model, dataset',
        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
         for p2 in dataset_list('internlm2_5-7b-turbomind', 'objective')])
    @pytest.mark.base_objective
    def test_base_objective(self, baseline_scores_fullbench, result_scores,
                            model, dataset):
        base_score = baseline_scores_fullbench.get(model).get('objective').get(
            dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)
    @pytest.mark.parametrize(
        'model, dataset',
        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
         for p2 in dataset_list('internlm2_5-7b-turbomind', 'long_context')])
    @pytest.mark.base_long_context
    def test_base_long_context(self, baseline_scores_fullbench, result_scores,
                               model, dataset):
        base_score = baseline_scores_fullbench.get(model).get(
            'long_context').get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)
    @pytest.mark.parametrize(
        'model, dataset',
        [(p1, p2)
         for p1 in ['internlm2_5-7b-chat-1m-turbomind'] for p2 in dataset_list(
             'internlm2_5-7b-chat-1m-turbomind', 'long_context')])
    @pytest.mark.chat_long_context
    def test_chat_long_context(self, baseline_scores_fullbench, result_scores,
                               model, dataset):
        base_score = baseline_scores_fullbench.get(model).get(
            'long_context').get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
 class TestCmdCase:
    """Score regression checks for the command-line evaluation cases.

    Every test looks up the expected value in the ``baseline_scores``
    fixture and the measured value in the ``result_scores`` fixture, then
    delegates the comparison to ``assert_score``. Cases 1 and 3 pass the
    plain model name; cases 2, 4 and 5 append ``_batch`` to it.
    """

    @pytest.mark.case1
    @pytest.mark.parametrize(
        'model, dataset',
        [('internlm2_5-7b-hf', name) for name in (
            'race-middle_accuracy',
            'race-high_accuracy',
            'demo_gsm8k_accuracy',
        )])
    def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
        """Compare ``internlm2_5-7b-hf`` scores using the plain model name."""
        expected = baseline_scores.get(model).get(dataset)
        measured = result_scores.get(model).get(dataset)
        assert_score(model, measured, expected, dataset)

    @pytest.mark.case2
    @pytest.mark.parametrize(
        'model, dataset',
        [(deployed_model, name)
         for deployed_model in (
             'internlm2_5-7b-chat-lmdeploy',
             'internlm3-8b-instruct-lmdeploy',
         )
         for name in (
             'race-middle_accuracy',
             'race-high_accuracy',
             'demo_gsm8k_accuracy',
         )])
    def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
        """Compare the two lmdeploy models using the ``_batch`` suffix."""
        expected = baseline_scores.get(model).get(dataset)
        measured = result_scores.get(model).get(dataset)
        model_type = model + '_batch'
        assert_score(model_type, measured, expected, dataset)

    @pytest.mark.case3
    @pytest.mark.parametrize(
        'model, dataset',
        [('internlm2_5-7b_hf', name) for name in (
            'race-middle_accuracy',
            'race-high_accuracy',
            'demo_gsm8k_accuracy',
        )])
    def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
        """Compare ``internlm2_5-7b_hf`` scores using the plain model name."""
        expected = baseline_scores.get(model).get(dataset)
        measured = result_scores.get(model).get(dataset)
        assert_score(model, measured, expected, dataset)

    @pytest.mark.case4
    @pytest.mark.parametrize(
        'model, dataset',
        [('internlm3-8b-instruct_hf-lmdeploy', name) for name in (
            'race-middle_accuracy',
            'race-high_accuracy',
            'demo_gsm8k_accuracy',
        )])
    def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
        """Compare ``internlm3-8b-instruct_hf-lmdeploy`` with ``_batch``."""
        expected = baseline_scores.get(model).get(dataset)
        measured = result_scores.get(model).get(dataset)
        model_type = model + '_batch'
        assert_score(model_type, measured, expected, dataset)

    @pytest.mark.case5
    @pytest.mark.parametrize(
        'model, dataset',
        [('internlm3-8b-instruct_hf-vllm', name) for name in (
            'race-middle_accuracy',
            'race-high_accuracy',
            'demo_gsm8k_accuracy',
        )])
    def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
        """Compare ``internlm3-8b-instruct_hf-vllm`` with ``_batch``."""
        expected = baseline_scores.get(model).get(dataset)
        measured = result_scores.get(model).get(dataset)
        model_type = model + '_batch'
        assert_score(model_type, measured, expected, dataset)
 def assert_score(model_type, score, baseline, dataset: str = ''):
    """Assert that ``score`` lies within a tolerance of ``baseline``.

    The tolerance depends on ``model_type`` and ``dataset``:

    * ``'batch'`` not in ``model_type``: ``baseline`` +/- 0.01.
    * otherwise +/- 5 for datasets starting with ``dingo``, ``GPQA``,
      ``high``, ``mmlu_pro_``, ``alpaca_eval`` or ``compassarena_``;
      +/- 10 for datasets starting with ``humanevalx`` or equal to
      ``'large_threshold'``; +/- 3 for everything else.

    The comparison outcome is printed in both the passing and failing case.

    Args:
        model_type: Model identifier; only the presence of the substring
            ``'batch'`` is inspected.
        score: Measured value, as a number or a numeric string. ``None``
            and ``'-'`` are treated as a missing result.
        baseline: Expected numeric value.
        dataset: Dataset/metric key used to pick the batch tolerance.

    Raises:
        AssertionError: if ``score`` or ``baseline`` is missing, or if
            ``score`` falls outside the tolerated range.
    """
    # Raise explicitly instead of ``assert False`` so the checks are not
    # stripped when Python runs with -O.
    if score is None or score == '-':
        raise AssertionError('value is none')
    if baseline is None:
        # Without this guard the arithmetic below fails with a TypeError
        # that hides the real cause (no baseline recorded for this key).
        raise AssertionError('baseline is none')

    is_batch = 'batch' in model_type
    if not is_batch:
        threshold = 0.01
    elif dataset.startswith(('dingo', 'GPQA', 'high', 'mmlu_pro_',
                             'alpaca_eval', 'compassarena_')):
        threshold = 5
    elif dataset.startswith('humanevalx') or dataset == 'large_threshold':
        threshold = 10
    else:
        threshold = 3

    lower = baseline - threshold
    upper = baseline + threshold
    within = lower <= float(score) <= upper

    # str(score) keeps the message working when score is passed as a number
    # rather than as the string read from the summary CSV.
    if is_batch:
        verdict = 'is between' if within else 'is not between'
        message = ' '.join(
            [str(score), verdict,
             str(lower), 'and',
             str(upper)])
    else:
        verdict = 'is equal' if within else 'is not equal'
        message = ' '.join([str(score), verdict, str(baseline)])

    print(message)
    if not within:
        raise AssertionError(message)
 def find_csv_files(directory):
    """Return the newest ``summary*.csv`` file found under ``directory``.

    The tree is walked recursively and every file whose name starts with
    ``summary`` and ends with ``.csv`` is a candidate. Candidates are ranked
    by ``os.path.getctime`` and the one with the largest value is returned.

    Args:
        directory: Root directory to search.

    Returns:
        Path of the selected file, built by joining the walk root and the
        file name.

    Raises:
        IndexError: if no matching file exists under ``directory``.
    """
    csv_files = [
        os.path.join(root, name)
        for root, _, names in os.walk(directory)
        for name in names
        if name.endswith('.csv') and name.startswith('summary')
    ]
    if not csv_files:
        # Same exception type as indexing an empty list, but with a message
        # that names the directory that was searched.
        raise IndexError('no summary*.csv file found under ' + str(directory))
    # sorted() is stable, so among equal timestamps the file discovered last
    # by os.walk is the one returned.
    return sorted(csv_files, key=os.path.getctime)[-1]
 def read_csv_file(file_path):
    """Load a summary CSV into a ``{column: {dataset_metric: value}}`` dict.

    Rows are skipped when their ``metric`` field is missing, contains
    ``bpb``, or is exactly ``_``. For every remaining row the key
    ``<dataset>_<metric>`` is formed, and the value of each column other
    than ``dataset``, ``version``, ``metric`` and ``mode`` is stored under
    that key. Values are kept as the strings read from the file.

    Args:
        file_path: Path of the CSV file; it must have ``dataset`` and
            ``metric`` columns.

    Returns:
        Dict keyed by the remaining column names (presumably one column per
        evaluated model -- confirm against the summarizer output), each
        mapping ``<dataset>_<metric>`` to the cell value.
    """
    skipped_columns = ('dataset', 'version', 'metric', 'mode')
    result = {}
    # newline='' is what the csv module expects for file objects, and the
    # explicit encoding avoids depending on the platform default when metric
    # names are non-ASCII (the baseline keys include Chinese metric names).
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            metric = row['metric']
            if metric is None or 'bpb' in metric or metric == '_':
                continue
            dataset = row['dataset'] + '_' + metric
            for column, value in row.items():
                if column in skipped_columns:
                    continue
                result.setdefault(column, {})[dataset] = value
    return result
--- a/.github/scripts/oc_score_baseline.yaml
+++ b/.github/scripts/oc_score_baseline.yaml
@ -0,0 +1,39 @@
 internlm2_5-7b-hf:
    demo_gsm8k_accuracy: 42.19
    race-middle_accuracy: 91.78
    race-high_accuracy: 90.02
 internlm2_5-7b_hf:
    demo_gsm8k_accuracy: 42.19
    race-middle_accuracy: 91.78
    race-high_accuracy: 90.02
 internlm2_5-7b-chat-lmdeploy:
    demo_gsm8k_accuracy: 84.38
    race-middle_accuracy: 92.76
    race-high_accuracy: 90.54
 internlm3-8b-instruct-lmdeploy:
    demo_gsm8k_accuracy: 73.44
    race-middle_accuracy: 93.38
    race-high_accuracy: 90.34
 internlm3-8b-instruct_hf-lmdeploy:
    demo_gsm8k_accuracy: 73.44
    race-middle_accuracy: 93.38
    race-high_accuracy: 90.34
 internlm3-8b-instruct_hf-vllm:
    demo_gsm8k_accuracy: 78.12
    race-middle_accuracy: 92.20
    race-high_accuracy: 89.88
 internlm2_5-7b-chat_hf:
    demo_gsm8k_accuracy: 87.50
    race-middle_accuracy: 92.76
    race-high_accuracy: 90.48
 lmdeploy-api-test:
    gsm8k_accuracy: 68.75
    race-middle_accuracy: 93.75
    race-high_accuracy: 93.75
--- a/.github/scripts/oc_score_baseline_fullbench.yaml
+++ b/.github/scripts/oc_score_baseline_fullbench.yaml
@ -0,0 +1,983 @@
 internlm2_5-7b-chat-hf_fullbench:
    objective:
        race-high_accuracy: 93.75
        ARC-c_accuracy: 93.75
        BoolQ_accuracy: 81.25
        triviaqa_wiki_1shot_score: 50
        nq_open_1shot_score: 25
        IFEval_Prompt-level-strict-accuracy: 50
        drop_accuracy: 81.25
        GPQA_diamond_accuracy: 25
        hellaswag_accuracy: 87.5
        TheoremQA_score: 12.50
        musr_average_naive_average: 39.58
        korbench_single_naive_average: 40
        gsm8k_accuracy: 62.50
        math_accuracy: 75
        cmo_fib_accuracy: 6.25
        aime2024_accuracy: 6.25
        wikibench-wiki-single_choice_cncircular_perf_4: 50
        sanitized_mbpp_score: 68.75
        ds1000_naive_average: 16.96
        lcb_code_generation_pass@1: 12.5
        lcb_code_execution_pass@1: 43.75
        lcb_test_output_pass@1: 18.75
        bbh-logical_deduction_seven_objects_score: 50
        bbh-multistep_arithmetic_two_score: 68.75
        mmlu-other_accuracy: 72.6
        cmmlu-china-specific_accuracy: 76.25
        mmlu_pro_math_accuracy: 25
        ds1000_Pandas_accuracy: 12.5
        ds1000_Numpy_accuracy: 0
        ds1000_Tensorflow_accuracy: 12.5
        ds1000_Scipy_accuracy: 18.75
        ds1000_Sklearn_accuracy: 18.75
        ds1000_Pytorch_accuracy: 12.5
        ds1000_Matplotlib_accuracy: 43.75
        openai_mmmlu_lite_AR-XY_accuracy: 37.5
        college_naive_average: 12.5
        college_knowledge_naive_average: 87.5
    subjective:
        alignment_bench_v1_1_总分: 0.66
        alpaca_eval_total: 20.00
        arenahard_score: 56.82
        Followbench_naive_average: 1
        CompassArena_naive_average: 43
        mtbench101_avg: 7.60
        wildbench_average: -14.58
        simpleqa_accuracy_given_attempted: 1.00
        chinese_simpleqa_given_attempted_accuracy: 0.90
        alignment_bench_v1_1_专业能力: 7.90
        alignment_bench_v1_1_数学计算: 0
        alignment_bench_v1_1_基本任务: 0
        alignment_bench_v1_1_逻辑推理: 0
        alignment_bench_v1_1_中文理解: 0
        alignment_bench_v1_1_文本写作: 0
        alignment_bench_v1_1_角色扮演: 0
        alignment_bench_v1_1_综合问答: 0
        alpaca_eval_helpful_base: 20.00
        compassarena_language_naive_average: 35
        compassarena_knowledge_naive_average: 60.00
        compassarena_reason_v2_naive_average: 40
        compassarena_math_v2_naive_average: 50.00
        compassarena_creationv2_zh_naive_average: 30
        followbench_llmeval_en_HSR_AVG: 1
        followbench_llmeval_en_SSR_AVG: 1
        followbench_llmeval_en_HSR_L1: 1
        followbench_llmeval_en_HSR_L2: 1
        followbench_llmeval_en_HSR_L3: 1
        followbench_llmeval_en_HSR_L4: 1
        followbench_llmeval_en_HSR_L5: 1
        followbench_llmeval_en_SSR_L1: 1
        followbench_llmeval_en_SSR_L2: 1
        followbench_llmeval_en_SSR_L3: 1
        followbench_llmeval_en_SSR_L4: 1
        followbench_llmeval_en_SSR_L5: 1
        simpleqa_f1: 0.12
 internlm2_5-7b-chat-turbomind_fullbench:
    objective:
        race-high_accuracy:  93.75
        ARC-c_accuracy: 93.75
        BoolQ_accuracy: 75.00
        triviaqa_wiki_1shot_score: 50
        nq_open_1shot_score: 25
        IFEval_Prompt-level-strict-accuracy: 56.25
        drop_accuracy: 75
        GPQA_diamond_accuracy: 37.50
        hellaswag_accuracy: 81.25
        TheoremQA_score: 12.5
        musr_average_naive_average: 39.58
        korbench_single_naive_average: 40
        gsm8k_accuracy: 68.75
        math_accuracy: 68.75
        cmo_fib_accuracy: 6.25
        aime2024_accuracy: 6.25
        wikibench-wiki-single_choice_cncircular_perf_4: 25
        sanitized_mbpp_score: 68.75
        ds1000_naive_average: 15.18
        lcb_code_generation_pass@1: 12.5
        lcb_code_execution_pass@1: 43.75
        lcb_test_output_pass@1: 0.00
        bbh-logical_deduction_seven_objects_score: 62.50
        bbh-multistep_arithmetic_two_score: 62.50
        mmlu-other_accuracy: 73.08
        cmmlu-china-specific_accuracy: 75.42
        mmlu_pro_math_accuracy: 25.00
        ds1000_Pandas_accuracy: 0.00
        ds1000_Numpy_accuracy: 0
        ds1000_Tensorflow_accuracy: 12.5
        ds1000_Scipy_accuracy: 18.75
        ds1000_Sklearn_accuracy: 18.75
        ds1000_Pytorch_accuracy: 12.50
        ds1000_Matplotlib_accuracy: 43.75
        openai_mmmlu_lite_AR-XY_accuracy: 37.5
        college_naive_average: 12.50
        college_knowledge_naive_average: 87.5
    subjective:
        alignment_bench_v1_1_总分: 0.72
        alpaca_eval_total: 20.00
        arenahard_score: 55.77
        Followbench_naive_average: 1
        CompassArena_naive_average: 39.00
        mtbench101_avg: 7.90
        wildbench_average: 0.00
        simpleqa_accuracy_given_attempted: 1.00
        chinese_simpleqa_given_attempted_accuracy: 1
        alignment_bench_v1_1_专业能力: 8.70
        alignment_bench_v1_1_数学计算: 0
        alignment_bench_v1_1_基本任务: 0
        alignment_bench_v1_1_逻辑推理: 0
        alignment_bench_v1_1_中文理解: 0
        alignment_bench_v1_1_文本写作: 0
        alignment_bench_v1_1_角色扮演: 0
        alignment_bench_v1_1_综合问答: 0
        alpaca_eval_helpful_base: 20.00
        compassarena_language_naive_average: 25.00
        compassarena_knowledge_naive_average: 55.00
        compassarena_reason_v2_naive_average: 35.00
        compassarena_math_v2_naive_average: 55.00
        compassarena_creationv2_zh_naive_average: 25.00
        followbench_llmeval_en_HSR_AVG: 1
        followbench_llmeval_en_SSR_AVG: 1
        followbench_llmeval_en_HSR_L1: 1
        followbench_llmeval_en_HSR_L2: 1
        followbench_llmeval_en_HSR_L3: 1
        followbench_llmeval_en_HSR_L4: 1
        followbench_llmeval_en_HSR_L5: 1
        followbench_llmeval_en_SSR_L1: 1
        followbench_llmeval_en_SSR_L2: 1
        followbench_llmeval_en_SSR_L3: 1
        followbench_llmeval_en_SSR_L4: 1
        followbench_llmeval_en_SSR_L5: 1
        simpleqa_f1: 0.12
 internlm2_5-7b-hf_fullbench:
    objective:
        race-high_accuracy: 100
        ARC-c_accuracy: 68.75
        BoolQ_accuracy: 87.5
        triviaqa_wiki_1shot_score: 43.75
        nq_open_1shot_score: 43.75
        drop_accuracy: 62.5
        GPQA_diamond_accuracy: 62.5
        hellaswag_accuracy: 93.75
        TheoremQA_score: 18.75
        winogrande_accuracy: 75
        gsm8k_accuracy: 37.5
        GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
        GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
        math_accuracy: 12.5
        wikibench-wiki-single_choice_cncircular_perf_4: 25
        sanitized_mbpp_score: 56.25
        dingo_en_192_score: 37.5
        dingo_zh_170_score: 100
        mmlu-other_accuracy: 76.92
        cmmlu-china-specific_accuracy: 84.17
        mmlu_pro_math_accuracy: 18.75
        bbh-logical_deduction_seven_objects_score: 43.75
        bbh-multistep_arithmetic_two_score: 56.25
        college_naive_average: 12.5
        college_knowledge_naive_average: 87.5
 internlm2_5-7b-turbomind_fullbench:
    objective:
        race-high_accuracy: 100
        ARC-c_accuracy: 68.75
        BoolQ_accuracy: 87.5
        triviaqa_wiki_1shot_score: 43.75
        nq_open_1shot_score: 43.75
        drop_accuracy: 62.5
        GPQA_diamond_accuracy: 68.75
        hellaswag_accuracy: 93.75
        TheoremQA_score: 18.75
        winogrande_accuracy: 87.5
        gsm8k_accuracy: 62.50
        GaokaoBench_2010-2022_Math_II_MCQs_score: 93.75
        GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
        math_accuracy: 6.25
        wikibench-wiki-single_choice_cncircular_perf_4: 0.00
        sanitized_mbpp_score: 62.50
        dingo_en_192_score: 37.50
        dingo_zh_170_score: 100.00
        mmlu-other_accuracy: 78.37
        cmmlu-china-specific_accuracy: 83.33
        mmlu_pro_math_accuracy: 18.75
        bbh-logical_deduction_seven_objects_score: 62.50
        bbh-multistep_arithmetic_two_score: 50.00
        college_naive_average: 12.5
        college_knowledge_naive_average: 87.5
 internlm2_5-7b-turbomind:
    objective:
        race-high_accuracy: 89.28
        ARC-c_accuracy: 52.2
        BoolQ_accuracy: 89.72
        triviaqa_wiki_1shot_score: 65.88
        nq_open_1shot_score: 34.82
        drop_accuracy: 68.1
        bbh_naive_average: 72.15
        GPQA_diamond_accuracy: 32.83
        hellaswag_accuracy: 88.36
        TheoremQA_score: 25
        winogrande_accuracy: 81.29
        gsm8k_accuracy: 74.68
        GaokaoBench_weighted_average: 58.19
        math_accuracy: 33.98
        Mathbench_naive_average: 48.38
        wikibench-wiki-single_choice_cncircular_perf_4: 29.1
        cmmlu_naive_average: 78.94
        mmlu_naive_average: 71.44
        mmlu_pro_naive_average: 38.18
        openai_humaneval_humaneval_pass@1: 59.76
        openai_humaneval_v2_humaneval_pass@1: 57.93
        sanitized_mbpp_score: 55.25
        dingo_en_192_score: 60.94
        dingo_zh_170_score: 67.65
        mmlu-stem_accuracy: 63.72
        mmlu-social-science_accuracy: 80.15
        mmlu-humanities_accuracy: 74.27
        mmlu-other_accuracy: 71.85
        cmmlu-stem_accuracy: 67.07
        cmmlu-social-science_accuracy: 81.49
        cmmlu-humanities_accuracy: 85.84
        cmmlu-other_accuracy: 82.69
        cmmlu-china-specific_accuracy: 79.88
        mmlu_pro_biology_accuracy: 58.58
        mmlu_pro_business_accuracy: 28.01
        mmlu_pro_chemistry_accuracy: 22.79
        mmlu_pro_computer_science_accuracy: 39.02
        mmlu_pro_economics_accuracy: 53.08
        mmlu_pro_engineering_accuracy: 25.7
        mmlu_pro_health_accuracy: 46.94
        mmlu_pro_history_accuracy: 43.04
        mmlu_pro_law_accuracy: 29.7
        mmlu_pro_math_accuracy: 24.2
        mmlu_pro_philosophy_accuracy: 42.48
        mmlu_pro_physics_accuracy: 26.02
        mmlu_pro_psychology_accuracy: 52.76
        mmlu_pro_other_accuracy: 42.21
        college_naive_average: 7.00
        high_naive_average: 6.67
        middle_naive_average: 26.67
        primary_naive_average: 64.00
        arithmetic_naive_average: 55
        mathbench-a (average)_naive_average: 31.8
        college_knowledge_naive_average: 58.23
        high_knowledge_naive_average: 52.51
        middle_knowledge_naive_average: 71.15
        primary_knowledge_naive_average: 60.48
        mathbench-t (average)_naive_average: 60.19
    long_context:
        Single-Needle-Retrieval(S-RT)-32000_naive_average: 100
        Single-Needle-Retrieval-EN-32000_naive_average: 100
        Single-Needle-Retrieval-ZH-32000_naive_average: 100
        Single-Needle-Retrieval(S-RT)-100000_naive_average: 100
        Single-Needle-Retrieval-EN-100000_naive_average: 100
        Single-Needle-Retrieval-ZH-100000_naive_average: 100
        Single-Needle-Retrieval(S-RT)-200000_naive_average: 100
        Single-Needle-Retrieval-EN-200000_naive_average: 100
        Single-Needle-Retrieval-ZH-200000_naive_average: 100
        longbench_naive_average: 46.19
        longbench_zh_naive_average: 49.3
        longbench_en_naive_average: 43.97
        longbench_single-document-qa_score: 42.84
        longbench_multi-document-qa_score: 41.25
        longbench_summarization_score: 23.21
        longbench_few-shot-learning_score: 61.67
        longbench_synthetic-tasks_score: 60.05
        longbench_code-completion_score: 52.09
 internlm2_5-7b-chat-turbomind:
    objective:
        race-high_accuracy: 86.16
        ARC-c_accuracy: 90.17
        BoolQ_accuracy: 87.89
        triviaqa_wiki_1shot_score: 64.91
        nq_open_1shot_score: 22.69
        mmmlu_lite_naive_average: 44.96
        IFEval_Prompt-level-strict-accuracy: 58.04
        drop_accuracy: 77.68
        bbh_naive_average: 73.14
        GPQA_diamond_accuracy: 31.06
        hellaswag_accuracy: 94.79
        TheoremQA_score: 22.25
        musr_average_naive_average: 50.89
        korbench_single_naive_average: 32.16
        ARC_Prize_Public_Evaluation_accuracy: 0.02
        gsm8k_accuracy: 86.73
        GaokaoBench_weighted_average: 78.6
        math_accuracy: 61
        cmo_fib_accuracy: 11
        aime2024_accuracy: 3.33
        Mathbench_naive_average: 64.23
        wikibench-wiki-single_choice_cncircular_perf_4: 31.32
        cmmlu_naive_average: 74.3
        mmlu_naive_average: 70.84
        mmlu_pro_naive_average: 44.98
        openai_humaneval_humaneval_pass@1: 69.8
        sanitized_mbpp_score: 64.4
        humanevalx_naive_average: 33.35
        ds1000_naive_average: 14.15
        lcb_code_generation_pass@1: 17.75
        lcb_code_execution_pass@1: 32.57
        lcb_test_output_pass@1: 26.13
        bigcodebench_hard_instruct_pass@1: 3.38
        bigcodebench_hard_complete_pass@1: 5.06
        teval_naive_average: 80
        SciCode_sub_accuracy: 5.56
        qa_dingo_cn_score: 99.01
        mmlu-stem_accuracy: 68.2
        mmlu-social-science_accuracy: 75.8
        mmlu-humanities_accuracy: 69.3
        mmlu-other_accuracy: 71.3
        cmmlu-stem_accuracy: 66.64
        cmmlu-social-science_accuracy: 76
        cmmlu-humanities_accuracy: 77.9
        cmmlu-other_accuracy: 77.25
        cmmlu-china-specific_accuracy: 73.6
        mmlu_pro_biology_accuracy: 66.67
        mmlu_pro_business_accuracy: 47.91
        mmlu_pro_chemistry_accuracy: 35
        mmlu_pro_computer_science_accuracy: 48.9
        mmlu_pro_economics_accuracy: 55.87
        mmlu_pro_engineering_accuracy: 29.62
        mmlu_pro_health_accuracy: 45
        mmlu_pro_history_accuracy: 40.8
        mmlu_pro_law_accuracy: 25.79
        mmlu_pro_math_accuracy: 53.48
        mmlu_pro_philosophy_accuracy: 38.38
        mmlu_pro_physics_accuracy: 37.79
        mmlu_pro_psychology_accuracy: 58.39
        mmlu_pro_other_accuracy: 46.27
        humanevalx-python_pass@1: 53.66
        humanevalx-cpp_pass@1: 22.56
        humanevalx-go_pass@1: 0
        humanevalx-js_pass@1: 54.88
        ds1000_Pandas_accuracy: 10.65
        ds1000_Numpy_accuracy: 3.63
        ds1000_Tensorflow_accuracy: 13.33
        ds1000_Scipy_accuracy: 8.96
        ds1000_Sklearn_accuracy: 6.96
        ds1000_Pytorch_accuracy: 6.62
        ds1000_Matplotlib_accuracy: 49.35
        openai_mmmlu_lite_AR-XY_accuracy: 17.19
        openai_mmmlu_lite_BN-BD_accuracy: 26.78
        openai_mmmlu_lite_DE-DE_accuracy: 51.27
        openai_mmmlu_lite_ES-LA_accuracy: 56.94
        openai_mmmlu_lite_FR-FR_accuracy: 58.22
        openai_mmmlu_lite_HI-IN_accuracy: 30.75
        openai_mmmlu_lite_ID-ID_accuracy: 50.6
        openai_mmmlu_lite_IT-IT_accuracy: 50.6
        openai_mmmlu_lite_JA-JP_accuracy: 51.13
        openai_mmmlu_lite_KO-KR_accuracy: 45
        openai_mmmlu_lite_PT-BR_accuracy: 57.68
        openai_mmmlu_lite_SW-KE_accuracy: 32.56
        openai_mmmlu_lite_YO-NG_accuracy: 32.42
        openai_mmmlu_lite_ZH-CN_accuracy: 65.4
        college_naive_average: 19.17
        high_naive_average: 46.5
        middle_naive_average: 61.34
        primary_naive_average: 73.34
        arithmetic_naive_average: 61.67
        mathbench-a (average)_naive_average: 52.58
        college_knowledge_naive_average: 67.1
        high_knowledge_naive_average: 70
        middle_knowledge_naive_average: 80
        primary_knowledge_naive_average: 90.12
        mathbench-t (average)_naive_average: 76
    subjective:
        alignment_bench_v1_1_总分: 5.68
        alpaca_eval_total: 25.96
        arenahard_score: 17.15
        Followbench_naive_average: 0.81
        CompassArena_naive_average: 39.49
        FoFo_naive_average: 0.38
        mtbench101_avg: 8.01
        wildbench_average: -10.49
        simpleqa_accuracy_given_attempted: 0.04
        chinese_simpleqa_given_attempted_accuracy: 0.34
        alignment_bench_v1_1_专业能力: 6.05
        alignment_bench_v1_1_数学计算: 5.87
        alignment_bench_v1_1_基本任务: 6.01
        alignment_bench_v1_1_逻辑推理: 4.48
        alignment_bench_v1_1_中文理解: 6.17
        alignment_bench_v1_1_文本写作: 6.06
        alignment_bench_v1_1_角色扮演: 6.3
        alignment_bench_v1_1_综合问答: 6.45
        alpaca_eval_helpful_base: 17.83
        alpaca_eval_koala: 28.21
        alpaca_eval_oasst: 23.4
        alpaca_eval_selfinstruct: 30.95
        alpaca_eval_vicuna: 25.00
        compassarena_language_naive_average: 53.00
        compassarena_knowledge_naive_average: 36
        compassarena_reason_v2_naive_average: 35
        compassarena_math_v2_naive_average: 16.07
        compassarena_creationv2_zh_naive_average: 43.64
        fofo_test_prompts_overall: 0.35
        fofo_test_prompts_cn_overall: 0.41
        followbench_llmeval_en_HSR_AVG: 0.73
        followbench_llmeval_en_SSR_AVG: 0.88
        followbench_llmeval_en_HSR_L1: 0.94
        followbench_llmeval_en_HSR_L2: 0.77
        followbench_llmeval_en_HSR_L3: 0.73
        followbench_llmeval_en_HSR_L4: 0.68
        followbench_llmeval_en_HSR_L5: 0.54
        followbench_llmeval_en_SSR_L1: 0.94
        followbench_llmeval_en_SSR_L2: 0.88
        followbench_llmeval_en_SSR_L3: 0.87
        followbench_llmeval_en_SSR_L4: 0.87
        followbench_llmeval_en_SSR_L5: 0.85
        simpleqa_f1: 0.04
 internlm2_5-7b-chat-1m-turbomind:
    long_context:
        ruler_8k_naive_average: 88.53
        ruler_32k_naive_average: 83.84
        ruler_128k_naive_average: 70.94
        NeedleBench-Overall-Score-8K_weighted_average: 91.89
        NeedleBench-Overall-Score-32K_weighted_average: 91.42
        NeedleBench-Overall-Score-128K_weighted_average: 88.57
        longbench_naive_average: 46.44
        longbench_zh_naive_average: 45.19
        longbench_en_naive_average: 45.71
        babilong_0k_naive_average: 79.3
        babilong_4k_naive_average: 67
        babilong_16k_naive_average: 52.7
        babilong_32k_naive_average: 48.9
        babilong_128k_naive_average: 40.8
        babilong_256k_naive_average: 23.5
        longbench_single-document-qa_score: 43.56
        longbench_multi-document-qa_score: 46.24
        longbench_summarization_score: 24.32
        longbench_few-shot-learning_score: 51.67
        longbench_synthetic-tasks_score: 66.83
        longbench_code-completion_score: 45.99
 qwen2.5-7b-instruct-turbomind:
    objective:
        race-high_accuracy: 84.99
        ARC-c_accuracy: 92.2
        BoolQ_accuracy: 86.7
        triviaqa_wiki_1shot_score: 53.06
        nq_open_1shot_score: 17.51
        mmmlu_lite_naive_average: 54.96
        IFEval_Prompt-level-strict-accuracy: 71.53
        drop_accuracy: 80.07
        bbh_naive_average: 68.81
        GPQA_diamond_accuracy: 34.34
        hellaswag_accuracy: 85.42
        TheoremQA_score: 18.38
        musr_average_naive_average: 43.44
        korbench_single_naive_average: 39.44
        ARC_Prize_Public_Evaluation_accuracy: 0
        gsm8k_accuracy: 92.57
        GaokaoBench_weighted_average: 80.14
        math_accuracy: 73.58
        cmo_fib_accuracy: 25
        aime2024_accuracy: 16.67
        Mathbench_naive_average: 77.33
        wikibench-wiki-single_choice_cncircular_perf_4: 34.9
        cmmlu_naive_average: 75.97
        mmlu_naive_average: 76.01
        mmlu_pro_naive_average: 56.12
        openai_humaneval_humaneval_pass@1: 83.54
        sanitized_mbpp_score: 74.71
        humanevalx_naive_average: 48.29
        ds1000_naive_average: 18.66
        lcb_code_generation_pass@1: 39.5
        lcb_code_execution_pass@1: 42.38
        lcb_test_output_pass@1: 50.68
        bigcodebench_hard_instruct_pass@1: 16.22
        bigcodebench_hard_complete_pass@1: 11.49
        teval_naive_average: 79.72
        SciCode_sub_accuracy: 10.76
        qa_dingo_cn_score: 99.01
        mmlu_accuracy: 76.01
        mmlu-stem_accuracy: 77.59
        mmlu-social-science_accuracy: 79.02
        mmlu-humanities_accuracy: 72.07
        mmlu-other_accuracy: 74.86
        cmmlu_accuracy: 75.97
        cmmlu-stem_accuracy: 73.09
        cmmlu-social-science_accuracy: 75.95
        cmmlu-humanities_accuracy: 76.53
        cmmlu-other_accuracy: 78.79
        cmmlu-china-specific_accuracy: 73.17
        mmlu_pro_accuracy: 56.12
        mmlu_pro_biology_accuracy: 71.41
        mmlu_pro_business_accuracy: 67.68
        mmlu_pro_chemistry_accuracy: 54.59
        mmlu_pro_computer_science_accuracy: 58.29
        mmlu_pro_economics_accuracy: 66.82
        mmlu_pro_engineering_accuracy: 42.41
        mmlu_pro_health_accuracy: 55.87
        mmlu_pro_history_accuracy: 46.46
        mmlu_pro_law_accuracy: 28.97
        mmlu_pro_math_accuracy: 73.13
        mmlu_pro_philosophy_accuracy: 44.89
        mmlu_pro_physics_accuracy: 58.43
        mmlu_pro_psychology_accuracy: 63.16
        mmlu_pro_other_accuracy: 53.57
        humanevalx-python_pass@1: 50
        humanevalx-cpp_pass@1: 42.07
        humanevalx-go_pass@1: 0
        humanevalx-java_pass@1: 53.05
        humanevalx-js_pass@1: 75
        ds1000_Pandas_accuracy: 14.09
        ds1000_Numpy_accuracy: 8.18
        ds1000_Tensorflow_accuracy: 17.78
        ds1000_Scipy_accuracy: 15.09
        ds1000_Sklearn_accuracy: 10.43
        ds1000_Pytorch_accuracy: 4.41
        ds1000_Matplotlib_accuracy: 60.65
        mmmlu_lite_accuracy: 54.96
        openai_mmmlu_lite_AR-XY_accuracy: 42.32
        openai_mmmlu_lite_BN-BD_accuracy: 42.25
        openai_mmmlu_lite_DE-DE_accuracy: 59.93
        openai_mmmlu_lite_ES-LA_accuracy: 66.53
        openai_mmmlu_lite_FR-FR_accuracy: 66.88
        openai_mmmlu_lite_HI-IN_accuracy: 49.26
        openai_mmmlu_lite_ID-ID_accuracy: 61.26
        openai_mmmlu_lite_IT-IT_accuracy: 65.47
        openai_mmmlu_lite_JA-JP_accuracy: 61.54
        openai_mmmlu_lite_KO-KR_accuracy: 60.28
        openai_mmmlu_lite_PT-BR_accuracy: 55.51
        openai_mmmlu_lite_SW-KE_accuracy: 36.42
        openai_mmmlu_lite_YO-NG_accuracy: 32.14
        openai_mmmlu_lite_ZH-CN_accuracy: 69.61
        college_naive_average: 44.33
        high_naive_average: 59
        middle_naive_average: 78
        primary_naive_average: 85.67
        arithmetic_naive_average: 75.67
        mathbench-a (average)_naive_average: 69.27
        college_knowledge_naive_average: 83.86
        high_knowledge_naive_average: 80.29
        middle_knowledge_naive_average: 84.26
        primary_knowledge_naive_average: 93.16
        mathbench-t (average)_naive_average: 85.39
 internlm2_5-7b-chat-pytorch:
    objective:
        race-high_accuracy: 86.39
        ARC-c_accuracy: 90.51
        BoolQ_accuracy: 88.01
        triviaqa_wiki_1shot_score: 64.77
        nq_open_1shot_score: 22.71
        mmmlu_lite_naive_average: 45.02
        IFEval_Prompt-level-strict-accuracy: 56.56
        drop_accuracy: 75.46
        bbh_naive_average: 73.34
        GPQA_diamond_accuracy: 32.83
        hellaswag_accuracy: 94.81
        TheoremQA_score: 23.88
        musr_average_naive_average: 51.31
        korbench_single_naive_average: 32
        ARC_Prize_Public_Evaluation_accuracy: 0.01
        gsm8k_accuracy: 86.96
        GaokaoBench_weighted_average: 78.05
        math_accuracy: 60.34
        cmo_fib_accuracy: 12.98
        aime2024_accuracy: 3.33
        Mathbench_naive_average: 64.82
        wikibench-wiki-single_choice_cncircular_perf_4: 31.7
        cmmlu_naive_average: 74.24
        mmlu_naive_average: 70.2
        mmlu_pro_naive_average: 45.39
        openai_humaneval_humaneval_pass@1: 70.12
        sanitized_mbpp_score: 64.59
        humanevalx_naive_average: 38.78
        ds1000_naive_average: 14.19
        lcb_code_generation_pass@1: 16.5
        lcb_code_execution_pass@1: 33.82
        lcb_test_output_pass@1: 22.62
        bigcodebench_hard_instruct_pass@1: 6.08
        bigcodebench_hard_complete_pass@1: 6.76
        teval_naive_average: 79.73
        SciCode_sub_accuracy: 3.47
        qa_dingo_cn_score: 100
        mmlu_accuracy: 70.2
        mmlu-stem_accuracy: 67.73
        mmlu-social-science_accuracy: 75.49
        mmlu-humanities_accuracy: 68.56
        mmlu-other_accuracy: 70.58
        cmmlu_accuracy: 74.24
        cmmlu-stem_accuracy: 66.7
        cmmlu-social-science_accuracy: 75.88
        cmmlu-humanities_accuracy: 77.56
        cmmlu-other_accuracy: 77.52
        cmmlu-china-specific_accuracy: 73.46
        mmlu_pro_accuracy: 45.39
        mmlu_pro_biology_accuracy: 65.83
        mmlu_pro_business_accuracy: 51.96
        mmlu_pro_chemistry_accuracy: 36.84
        mmlu_pro_computer_science_accuracy: 48.29
        mmlu_pro_economics_accuracy: 56.16
        mmlu_pro_engineering_accuracy: 29.1
        mmlu_pro_health_accuracy: 44.5
        mmlu_pro_history_accuracy: 42.26
        mmlu_pro_law_accuracy: 24.98
        mmlu_pro_math_accuracy: 54.85
        mmlu_pro_philosophy_accuracy: 39.28
        mmlu_pro_physics_accuracy: 37.41
        mmlu_pro_psychology_accuracy: 58.27
        mmlu_pro_other_accuracy: 45.78
        humanevalx-python_pass@1: 56.1
        humanevalx-cpp_pass@1: 20.73
        humanevalx-go_pass@1: 0
        humanevalx-java_pass@1: 59.15
        humanevalx-js_pass@1: 57.93
        ds1000_Pandas_accuracy: 8.93
        ds1000_Numpy_accuracy: 4.09
        ds1000_Tensorflow_accuracy: 11.11
        ds1000_Scipy_accuracy: 7.55
        ds1000_Sklearn_accuracy: 7.83
        ds1000_Pytorch_accuracy: 8.82
        ds1000_Matplotlib_accuracy: 50.97
        mmmlu_lite_accuracy: 45.02
        openai_mmmlu_lite_AR-XY_accuracy: 18.6
        openai_mmmlu_lite_BN-BD_accuracy: 27.58
        openai_mmmlu_lite_DE-DE_accuracy: 51.23
        openai_mmmlu_lite_ES-LA_accuracy: 56.63
        openai_mmmlu_lite_FR-FR_accuracy: 58.11
        openai_mmmlu_lite_HI-IN_accuracy: 33.82
        openai_mmmlu_lite_ID-ID_accuracy: 50.39
        openai_mmmlu_lite_IT-IT_accuracy: 50.39
        openai_mmmlu_lite_JA-JP_accuracy: 50.95
        openai_mmmlu_lite_KO-KR_accuracy: 45.05
        openai_mmmlu_lite_PT-BR_accuracy: 57.89
        openai_mmmlu_lite_SW-KE_accuracy: 32.14
        openai_mmmlu_lite_YO-NG_accuracy: 32.14
        openai_mmmlu_lite_ZH-CN_accuracy: 65.33
        college_naive_average: 21
        high_naive_average: 47
        middle_naive_average: 59.67
        primary_naive_average: 72.33
        arithmetic_naive_average: 62
        mathbench-a (average)_naive_average: 53.13
        college_knowledge_naive_average: 68.99
        high_knowledge_naive_average: 70.06
        middle_knowledge_naive_average: 78.53
        primary_knowledge_naive_average: 88.49
        mathbench-t (average)_naive_average: 76.51
 qwen2.5-7b-instruct-pytorch:
    objective:
        race-high_accuracy: 85.16
        ARC-c_accuracy: 90.85
        BoolQ_accuracy: 86.61
        triviaqa_wiki_1shot_score: 52.96
        nq_open_1shot_score: 17.62
        mmmlu_lite_naive_average: 54.7
        IFEval_Prompt-level-strict-accuracy: 71.35
        drop_accuracy: 80.23
        bbh_naive_average: 68.88
        GPQA_diamond_accuracy: 36.36
        hellaswag_accuracy: 85.49
        TheoremQA_score: 18.38
        musr_average_naive_average: 43.3
        korbench_single_naive_average: 39.44
        ARC_Prize_Public_Evaluation_accuracy: 0
        gsm8k_accuracy: 91.66
        GaokaoBench_weighted_average: 80.02
        math_accuracy: 73.74
        cmo_fib_accuracy: 22.60
        aime2024_accuracy: 13.33
        Mathbench_naive_average: 77.08
        wikibench-wiki-single_choice_cncircular_perf_4: 34
        cmmlu_naive_average: 75.9
        mmlu_naive_average: 76.27
        mmlu_pro_naive_average: 56.14
        openai_humaneval_humaneval_pass@1: 84.76
        sanitized_mbpp_score: 74.71
        humanevalx_naive_average: 48.17
        ds1000_naive_average: 18.57
        lcb_code_generation_pass@1: 38.75
        lcb_code_execution_pass@1: 42.38
        lcb_test_output_pass@1: 50.45
        bigcodebench_hard_instruct_pass@1: 16.89
        bigcodebench_hard_complete_pass@1: 12.16
        teval_naive_average: 79.46
        SciCode_sub_accuracy: 10.42
        qa_dingo_cn_score: 100
        mmlu_accuracy: 76.27
        mmlu-stem_accuracy: 77.75
        mmlu-social-science_accuracy: 78.65
        mmlu-humanities_accuracy: 73.12
        mmlu-other_accuracy: 75.05
        cmmlu_accuracy: 75.9
        cmmlu-stem_accuracy: 73.41
        cmmlu-social-science_accuracy: 75.97
        cmmlu-humanities_accuracy: 76.42
        cmmlu-other_accuracy: 78.15
        cmmlu-china-specific_accuracy: 73.27
        mmlu_pro_accuracy: 56.14
        mmlu_pro_biology_accuracy: 72.25
        mmlu_pro_business_accuracy: 66.16
        mmlu_pro_chemistry_accuracy: 55.65
        mmlu_pro_computer_science_accuracy: 60.24
        mmlu_pro_economics_accuracy: 66.82
        mmlu_pro_engineering_accuracy: 41.38
        mmlu_pro_health_accuracy: 54.89
        mmlu_pro_history_accuracy: 46.46
        mmlu_pro_law_accuracy: 29.06
        mmlu_pro_math_accuracy: 73.58
        mmlu_pro_philosophy_accuracy: 44.89
        mmlu_pro_physics_accuracy: 60.05
        mmlu_pro_psychology_accuracy: 61.9
        mmlu_pro_other_accuracy: 52.6
        humanevalx-python_pass@1: 51.83
        humanevalx-cpp_pass@1: 42.68
        humanevalx-go_pass@1: 0
        humanevalx-java_pass@1: 73.78
        humanevalx-js_pass@1: 72.56
        ds1000_Pandas_accuracy: 14.09
        ds1000_Numpy_accuracy: 8.64
        ds1000_Tensorflow_accuracy: 17.78
        ds1000_Scipy_accuracy: 15.09
        ds1000_Sklearn_accuracy: 8.7
        ds1000_Pytorch_accuracy: 4.41
        ds1000_Matplotlib_accuracy: 61.29
        mmmlu_lite_accuracy: 54.7
        openai_mmmlu_lite_AR-XY_accuracy: 42.32
        openai_mmmlu_lite_BN-BD_accuracy: 42.18
        openai_mmmlu_lite_DE-DE_accuracy: 60
        openai_mmmlu_lite_ES-LA_accuracy: 66.18
        openai_mmmlu_lite_FR-FR_accuracy: 66.88
        openai_mmmlu_lite_HI-IN_accuracy: 48.63
        openai_mmmlu_lite_ID-ID_accuracy: 61.26
        openai_mmmlu_lite_IT-IT_accuracy: 65.26
        openai_mmmlu_lite_JA-JP_accuracy: 60.7
        openai_mmmlu_lite_KO-KR_accuracy: 60.63
        openai_mmmlu_lite_PT-BR_accuracy: 54.46
        openai_mmmlu_lite_SW-KE_accuracy: 36
        openai_mmmlu_lite_YO-NG_accuracy: 31.86
        openai_mmmlu_lite_ZH-CN_accuracy: 69.4
        college_naive_average: 48.33
        high_naive_average: 59.33
        middle_naive_average: 76.67
        primary_naive_average: 86.67
        arithmetic_naive_average: 74.33
        mathbench-a (average)_naive_average: 69.07
        college_knowledge_naive_average: 83.54
        high_knowledge_naive_average: 80.82
        middle_knowledge_naive_average: 83.79
        primary_knowledge_naive_average: 92.22
        mathbench-t (average)_naive_average: 85.1
 internlm3-8b-instruct-turbomind:
    objective:
        race-high_accuracy: 89.22
        ARC-c_accuracy: 92.54
        BoolQ_accuracy: 86.45
        triviaqa_wiki_1shot_score: 60.72
        nq_open_1shot_score: 20.25
        mmmlu_lite_naive_average: 41.82
        IFEval_Prompt-level-strict-accuracy: 77.45
        drop_accuracy: 83.27
        bbh_naive_average: 55.22
        GPQA_diamond_accuracy: 37.88
        hellaswag_accuracy: 91.28
        TheoremQA_score: 20.12
        musr_average_naive_average: 36.86
        korbench_single_naive_average: 41.2
        ARC_Prize_Public_Evaluation_accuracy: 0.06
        gsm8k_accuracy: 91.28
        GaokaoBench_weighted_average: 86.59
        math_accuracy: 76.96
        cmo_fib_accuracy: 38.46
        aime2024_accuracy: 13.33
        Mathbench_naive_average: 78.96
        wikibench-wiki-single_choice_cncircular_perf_4: 37.45
        cmmlu_naive_average: 83.33
        mmlu_naive_average: 76.21
        mmlu_pro_naive_average: 57.96
        openai_humaneval_humaneval_pass@1: 81.71
        sanitized_mbpp_score: 69.65
        humanevalx_naive_average: 40.73
        ds1000_naive_average: 27.23
        lcb_code_generation_pass@1: 34.75
        lcb_code_execution_pass@1: 49.9
        lcb_test_output_pass@1: 48.19
        bigcodebench_hard_instruct_pass@1: 13.51
        bigcodebench_hard_complete_pass@1: 15.54
        teval_naive_average: 82.86
        SciCode_sub_accuracy: 11.11
        qa_dingo_cn_score: 100
        mmlu_accuracy: 76.21
        mmlu-stem_accuracy: 77.7
        mmlu-social-science_accuracy: 80.98
        mmlu-humanities_accuracy: 70.83
        mmlu-other_accuracy: 75.01
        cmmlu_accuracy: 83.33
        cmmlu-stem_accuracy: 79.66
        cmmlu-social-science_accuracy: 83.39
        cmmlu-humanities_accuracy: 84.73
        cmmlu-other_accuracy: 86.2
        cmmlu-china-specific_accuracy: 81.77
        mmlu_pro_accuracy: 57.96
        mmlu_pro_biology_accuracy: 75.45
        mmlu_pro_business_accuracy: 64.64
        mmlu_pro_chemistry_accuracy: 59.81
        mmlu_pro_computer_science_accuracy: 60.24
        mmlu_pro_economics_accuracy: 68.6
        mmlu_pro_engineering_accuracy: 44.79
        mmlu_pro_health_accuracy: 58.31
        mmlu_pro_history_accuracy: 49.87
        mmlu_pro_law_accuracy: 32.43
        mmlu_pro_math_accuracy: 70.17
        mmlu_pro_philosophy_accuracy: 46.89
        mmlu_pro_physics_accuracy: 59.58
        mmlu_pro_psychology_accuracy: 66.29
        mmlu_pro_other_accuracy: 54.33
        humanevalx-python_pass@1: 43.9
        humanevalx-cpp_pass@1: 20.12
        humanevalx-go_pass@1: 0
        humanevalx-java_pass@1: 40.85
        humanevalx-js_pass@1: 65.24
        ds1000_Pandas_accuracy: 16.49
        ds1000_Numpy_accuracy: 34.09
        ds1000_Tensorflow_accuracy: 26.67
        ds1000_Scipy_accuracy: 17.92
        ds1000_Sklearn_accuracy: 20.87
        ds1000_Pytorch_accuracy: 19.12
        ds1000_Matplotlib_accuracy: 55.48
        mmmlu_lite_accuracy: 41.82
        openai_mmmlu_lite_AR-XY_accuracy: 32.56
        openai_mmmlu_lite_BN-BD_accuracy: 4.56
        openai_mmmlu_lite_DE-DE_accuracy: 24.91
        openai_mmmlu_lite_ES-LA_accuracy: 51.09
        openai_mmmlu_lite_FR-FR_accuracy: 61.68
        openai_mmmlu_lite_HI-IN_accuracy: 24.98
        openai_mmmlu_lite_ID-ID_accuracy: 44.56
        openai_mmmlu_lite_IT-IT_accuracy: 52.35
        openai_mmmlu_lite_JA-JP_accuracy: 51.02
        openai_mmmlu_lite_KO-KR_accuracy: 47.93
        openai_mmmlu_lite_PT-BR_accuracy: 53.89
        openai_mmmlu_lite_SW-KE_accuracy: 33.47
        openai_mmmlu_lite_YO-NG_accuracy: 33.47
        openai_mmmlu_lite_ZH-CN_accuracy: 69.05
        college_naive_average: 45.67
        high_naive_average: 64.67
        middle_naive_average: 82.33
        primary_naive_average: 90.33
        arithmetic_naive_average: 74
        mathbench-a (average)_naive_average: 71.4
        college_knowledge_naive_average: 85.28
        high_knowledge_naive_average: 79.43
        middle_knowledge_naive_average: 87.9
        primary_knowledge_naive_average: 93.42
        mathbench-t (average)_naive_average: 86.51
 internlm3-8b-instruct-pytorch:
    objective:
        race-high_accuracy: 89.02
        ARC-c_accuracy: 93.56
        BoolQ_accuracy: 86.67
        triviaqa_wiki_1shot_score: 60.54
        nq_open_1shot_score: 20.3
        mmmlu_lite_naive_average: 42.6
        IFEval_Prompt-level-strict-accuracy: 79.11
        drop_accuracy: 83.32
        bbh_naive_average: 54.76
        GPQA_diamond_accuracy: 33.84
        hellaswag_accuracy: 91.31
        TheoremQA_score: 18
        musr_average_naive_average: 36.62
        korbench_single_naive_average: 41.84
        ARC_Prize_Public_Evaluation_accuracy: 0.06
        gsm8k_accuracy: 90.67
        GaokaoBench_weighted_average: 86.27
        math_accuracy: 76.68
        cmo_fib_accuracy: 33.65
        aime2024_accuracy: 10
        Mathbench_naive_average: 78.92
        wikibench-wiki-single_choice_cncircular_perf_4: 37.35
        cmmlu_naive_average: 83.11
        mmlu_naive_average: 76.23
        mmlu_pro_naive_average: 58.16
        openai_humaneval_humaneval_pass@1: 82.32
        sanitized_mbpp_score: 70.04
        humanevalx_naive_average: 25.49
        ds1000_naive_average: 27.84
        lcb_code_generation_pass@1: 34.5
        lcb_code_execution_pass@1: 48.02
        lcb_test_output_pass@1: 47.74
        bigcodebench_hard_instruct_pass@1: 12.84
        bigcodebench_hard_complete_pass@1: 15.54
        teval_naive_average: 82.86
        SciCode_sub_accuracy: 9.38
        qa_dingo_cn_score: 100
        mmlu_accuracy: 76.23
        mmlu-stem_accuracy: 78.08
        mmlu-social-science_accuracy: 80.31
        mmlu-humanities_accuracy: 71.38
        mmlu-other_accuracy: 74.63
        cmmlu_accuracy: 83.11
        cmmlu-stem_accuracy: 79.42
        cmmlu-social-science_accuracy: 83.34
        cmmlu-humanities_accuracy: 83.95
        cmmlu-other_accuracy: 86.22
        cmmlu-china-specific_accuracy: 81.5
        mmlu_pro_accuracy: 58.16
        mmlu_pro_biology_accuracy: 74.62
        mmlu_pro_business_accuracy: 65.02
        mmlu_pro_chemistry_accuracy: 60.69
        mmlu_pro_computer_science_accuracy: 61.46
        mmlu_pro_economics_accuracy: 68.25
        mmlu_pro_engineering_accuracy: 45.3
        mmlu_pro_health_accuracy: 60.15
        mmlu_pro_history_accuracy: 50.66
        mmlu_pro_law_accuracy: 31.7
        mmlu_pro_math_accuracy: 70.32
        mmlu_pro_philosophy_accuracy: 47.7
        mmlu_pro_physics_accuracy: 59.51
        mmlu_pro_psychology_accuracy: 65.41
        mmlu_pro_other_accuracy: 53.46
        humanevalx-python_pass@1: 42.68
        humanevalx-cpp_pass@1: 19.51
        humanevalx-go_pass@1: 0
        humanevalx-java_pass@1: 0.00
        humanevalx-js_pass@1: 64.02
        ds1000_Pandas_accuracy: 14.09
        ds1000_Numpy_accuracy: 35
        ds1000_Tensorflow_accuracy: 24.44
        ds1000_Scipy_accuracy: 20.75
        ds1000_Sklearn_accuracy: 21.74
        ds1000_Pytorch_accuracy: 22.06
        ds1000_Matplotlib_accuracy: 56.77
        mmmlu_lite_accuracy: 42.6
        openai_mmmlu_lite_AR-XY_accuracy: 32.84
        openai_mmmlu_lite_BN-BD_accuracy: 10.46
        openai_mmmlu_lite_DE-DE_accuracy: 24.56
        openai_mmmlu_lite_ES-LA_accuracy: 50.95
        openai_mmmlu_lite_FR-FR_accuracy: 61.05
        openai_mmmlu_lite_HI-IN_accuracy: 30.6
        openai_mmmlu_lite_ID-ID_accuracy: 45.89
        openai_mmmlu_lite_IT-IT_accuracy: 51.79
        openai_mmmlu_lite_JA-JP_accuracy: 51.65
        openai_mmmlu_lite_KO-KR_accuracy: 48.77
        openai_mmmlu_lite_PT-BR_accuracy: 52.7
        openai_mmmlu_lite_SW-KE_accuracy: 32.91
        openai_mmmlu_lite_YO-NG_accuracy: 32.84
        openai_mmmlu_lite_ZH-CN_accuracy: 69.33
        college_naive_average: 47
        high_naive_average: 66.67
        middle_naive_average: 81.67
        primary_naive_average: 89.33
        arithmetic_naive_average: 73.67
        mathbench-a (average)_naive_average: 71.67
        college_knowledge_naive_average: 82.91
        high_knowledge_naive_average: 79.86
        middle_knowledge_naive_average: 88.92
        primary_knowledge_naive_average: 92.96
        mathbench-t (average)_naive_average: 86.16
--- a/.github/scripts/oc_score_baseline_testrange.yaml
+++ b/.github/scripts/oc_score_baseline_testrange.yaml
@ -0,0 +1,432 @@
 chat:
    glm-4-9b-chat-hf:
        gsm8k_accuracy: 56.25
        race-high_accuracy: 84.38
    glm-4-9b-chat-turbomind:
        gsm8k_accuracy: 71.88
        race-high_accuracy: 90.62
    glm-4-9b-chat-vllm:
        gsm8k_accuracy: 71.88
        race-high_accuracy: 90.62
    deepseek-7b-chat-hf:
        gsm8k_accuracy: 46.88
        race-high_accuracy: 81.25
    deepseek-r1-distill-llama-8b-turbomind:
        gsm8k_accuracy: 34.38
        race-high_accuracy: 81.25
    deepseek-r1-distill-qwen-1_5b-turbomind:
        gsm8k_accuracy: 28.12
        race-high_accuracy: 53.12
    deepseek-7b-chat-vllm:
        gsm8k_accuracy: 56.25
        race-high_accuracy: 78.12
    gemma2-2b-it-hf:
        gsm8k_accuracy: 50
        race-high_accuracy: 75
    gemma2-9b-it-hf:
        gsm8k_accuracy: 68.75
        race-high_accuracy: 84.38
    gemma-2b-it-hf:
        gsm8k_accuracy: 3.12
        race-high_accuracy: 40.62
    gemma-7b-it-hf:
        gsm8k_accuracy: 40.62
        race-high_accuracy: 68.75
    gemma-2-9b-it-turbomind:
        gsm8k_accuracy: 68.75
        race-high_accuracy: 84.38
    gemma-2-27b-it-turbomind:
        gsm8k_accuracy: 78.12
        race-high_accuracy: 93.75
    gemma-7b-it-vllm:
        gsm8k_accuracy: 28.12
        race-high_accuracy: 68.75
    internlm2_5-7b-chat-hf:
        gsm8k_accuracy: 84.38
        race-high_accuracy: 90.62
    internlm3-8b-instruct-hf:
        gsm8k_accuracy: 65.62
        race-high_accuracy: 87.5
    internlm2_5-7b-chat-turbomind:
        gsm8k_accuracy: 81.25
        race-high_accuracy: 90.62
    internlm2-chat-1.8b-turbomind:
        gsm8k_accuracy: 25.00
        race-high_accuracy: 84.38
    internlm2-chat-1.8b-sft-turbomind:
        gsm8k_accuracy: 34.38
        race-high_accuracy: 84.38
    internlm2-chat-7b-lmdeploy:
        gsm8k_accuracy: 59.38
        race-high_accuracy: 87.50
    internlm2-chat-7b-sft-turbomind:
        gsm8k_accuracy: 56.25
        race-high_accuracy: 87.50
    internlm3-8b-instruct-turbomind:
        gsm8k_accuracy: 65.62
        race-high_accuracy: 87.5
    internlm2-chat-7b-vllm:
        gsm8k_accuracy: 53.12
        race-high_accuracy: 87.50
    llama-3_1-8b-instruct-hf:
        gsm8k_accuracy: 84.38
        race-high_accuracy: 90.62
    llama-3_2-3b-instruct-hf:
        gsm8k_accuracy: 71.88
        race-high_accuracy: 81.25
    llama-3-8b-instruct-hf:
        gsm8k_accuracy: 68.75
        race-high_accuracy: 87.5
    llama-2-7b-chat-turbomind:
        gsm8k_accuracy: 18.75
        race-high_accuracy: 46.88
    llama-3_1-8b-instruct-turbomind:
        gsm8k_accuracy: 84.38
        race-high_accuracy: 90.62
    llama-3_2-3b-instruct-turbomind:
        gsm8k_accuracy: 65.62
        race-high_accuracy: 81.25
    llama-3-8b-instruct-turbomind:
        gsm8k_accuracy: 65.62
        race-high_accuracy: 84.38
    mistral-7b-instruct-v0.2-hf:
        gsm8k_accuracy: 40.62
        race-high_accuracy: 75
    mistral-7b-instruct-v0.3-hf:
        gsm8k_accuracy: 40.62
        race-high_accuracy: 75
    mistral-nemo-instruct-2407-hf:
        gsm8k_accuracy: 75
        race-high_accuracy: 81.25
    mistral-nemo-instruct-2407-turbomind:
        gsm8k_accuracy: 71.88
        race-high_accuracy: 75
    mistral-7b-instruct-v0.1-vllm:
        gsm8k_accuracy: 34.38
        race-high_accuracy: 65.62
    mistral-7b-instruct-v0.2-vllm:
        gsm8k_accuracy: 28.12
        race-high_accuracy: 78.12
    qwen2.5-0.5b-instruct-hf:
        gsm8k_accuracy: 34.38
        race-high_accuracy: 46.88
    qwen2.5-3b-instruct-hf:
        gsm8k_accuracy: 53.12
        race-high_accuracy: 90.62
    qwen2.5-0.5b-instruct-turbomind:
        gsm8k_accuracy: 28.12
        race-high_accuracy: 43.75
    qwen2.5-3b-instruct-turbomind:
        gsm8k_accuracy: 56.25
        race-high_accuracy: 90.62
    qwen1.5-0.5b-chat-hf:
        gsm8k_accuracy: 0
        race-high_accuracy: 53.12
    qwen2-1.5b-instruct-hf:
        gsm8k_accuracy: 62.5
        race-high_accuracy: 84.38
    qwen2-7b-instruct-hf:
        gsm8k_accuracy: 68.75
        race-high_accuracy: 90.62
    qwen2-1.5b-instruct-turbomind:
        gsm8k_accuracy: 56.25
        race-high_accuracy: 84.38
    qwen2-7b-instruct-turbomind:
        gsm8k_accuracy: 75.00
        race-high_accuracy: 87.50
    qwen1.5-0.5b-chat-vllm:
        gsm8k_accuracy: 6.25
        race-high_accuracy: 53.12
    yi-1.5-6b-chat-hf:
        gsm8k_accuracy: 65.62
        race-high_accuracy: 84.38
    yi-1.5-9b-chat-hf:
        gsm8k_accuracy: 75
        race-high_accuracy: 93.75
    yi-1.5-6b-chat-turbomind:
        gsm8k_accuracy: 59.38
        race-high_accuracy: 84.38
    yi-1.5-9b-chat-turbomind:
        gsm8k_accuracy: 78.12
        race-high_accuracy: 93.75
    deepseek-v2_lite-chat-turbomind:
        gsm8k_accuracy: 43.75
        race-high_accuracy: 71.88
    gemma2-27b-it-hf:
        gsm8k_accuracy: 71.88
        race-high_accuracy: 93.75
    internlm2_5-20b-chat-hf:
        gsm8k_accuracy: 84.38
        race-high_accuracy: 87.5
    internlm2_5-20b-chat-turbomind:
        gsm8k_accuracy: 87.50
        race-high_accuracy: 87.5
    mistral-small-instruct-2409-hf:
        gsm8k_accuracy: 81.25
        race-high_accuracy: 87.50
    mistral-small-instruct-2409-turbomind:
        gsm8k_accuracy: 78.12
        race-high_accuracy: 87.50
    phi-4:
        gsm8k_accuracy: 81.25
        race-high_accuracy: 87.50
    qwen2.5-14b-instruct-hf:
        gsm8k_accuracy: 71.88
        race-high_accuracy: 96.88
    qwen2.5-14b-instruct-turbomind:
        gsm8k_accuracy: 71.88
        race-high_accuracy: 96.88
    yi-1.5-34b-chat-turbomind:
        gsm8k_accuracy: 71.88
        race-high_accuracy: 93.75
    deepseek-67b-chat-turbomind:
        gsm8k_accuracy: 71.88
        race-high_accuracy: 75.00
    deepseek-r1-distill-qwen-32b-turbomind:
        gsm8k_accuracy: 31.25
        race-high_accuracy: 90.62
    llama-3_3-70b-instruct-turbomind:
        gsm8k_accuracy: 93.75
        race-high_accuracy: 87.5
    mixtral-large-instruct-2411-turbomind:
        gsm8k_accuracy: 87.50
        race-high_accuracy: 93.75
    nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
        gsm8k_accuracy: 90.62
        race-high_accuracy: 53.12
    qwen2.5-72b-instruct-turbomind:
        gsm8k_accuracy: 78.12
        race-high_accuracy: 90.62
    deepseek-r1-distill-llama-70b-turbomind:
        gsm8k_accuracy: 50.00
        race-high_accuracy: 87.50
    deepseek-v2_5-1210-turbomind:
        gsm8k_accuracy: 90.62
        race-high_accuracy: 84.38
    mixtral-8x22b-instruct-v0.1-turbomind:
        gsm8k_accuracy: 75.00
        race-high_accuracy: 78.12
    mixtral-8x22b-instruct-v0.1-vllm:
        gsm8k_accuracy: 78.12
        race-high_accuracy: 78.12
 base:
    glm-4-9b-turbomind:
        gsm8k_accuracy: 59.38
        GPQA_diamond_accuracy: 28.12
        race-high_accuracy: 93.75
        winogrande_accuracy: 84.38
    deepseek-7b-base-hf:
        gsm8k_accuracy: 25
        GPQA_diamond_accuracy: 0
        race-high_accuracy: 46.88
        winogrande_accuracy: 71.88
    deepseek-7b-base-turbomind:
        gsm8k_accuracy: 18.75
        GPQA_diamond_accuracy: 3.12
        race-high_accuracy: 50.00
        winogrande_accuracy: 84.38
    deepseek-moe-16b-base-vllm:
        gsm8k_accuracy: 25.00
        GPQA_diamond_accuracy: 0
        race-high_accuracy: 25
        winogrande_accuracy: 68.75
    gemma2-2b-hf:
        gsm8k_accuracy: 31.25
        GPQA_diamond_accuracy: 3.12
        race-high_accuracy: 56.25
        winogrande_accuracy: 75.00
    gemma2-9b-hf:
        gsm8k_accuracy: 75.00
        GPQA_diamond_accuracy: 0
        race-high_accuracy: 84.38
        winogrande_accuracy: 81.25
    gemma-2b-hf:
        gsm8k_accuracy: 21.88
        GPQA_diamond_accuracy: 3.12
        race-high_accuracy: 21.88
        winogrande_accuracy: 53.12
    gemma-7b-hf:
        gsm8k_accuracy: 56.25
        GPQA_diamond_accuracy: 3.12
        race-high_accuracy: 65.62
        winogrande_accuracy: 71.88
    gemma-2-9b-turbomind:
        gsm8k_accuracy: 68.75
        GPQA_diamond_accuracy: 0
        race-high_accuracy: 84.38
        winogrande_accuracy: 81.25
    gemma-2b-vllm:
        gsm8k_accuracy: 15.62
        GPQA_diamond_accuracy: 3.12
        race-high_accuracy: 28.12
        winogrande_accuracy: 68.75
    gemma-7b-vllm:
        gsm8k_accuracy: 59.38
        GPQA_diamond_accuracy: 6.25
        race-high_accuracy: 81.25
        winogrande_accuracy: 81.25
    internlm2_5-7b-hf:
        gsm8k_accuracy: 37.5
        GPQA_diamond_accuracy: 25
        race-high_accuracy: 93.75
        winogrande_accuracy: 71.88
    internlm2-7b-hf:
        gsm8k_accuracy: 53.12
        GPQA_diamond_accuracy: 18.75
        race-high_accuracy: 62.5
        winogrande_accuracy: 78.12
    internlm2-1.8b-turbomind:
        gsm8k_accuracy: 12.50
        GPQA_diamond_accuracy: 9.38
        race-high_accuracy: 71.88
        winogrande_accuracy: 75
    internlm2_5-7b-turbomind:
        gsm8k_accuracy: 62.5
        GPQA_diamond_accuracy: 31.25
        race-high_accuracy: 93.75
        winogrande_accuracy: 87.5
    internlm2-7b-turbomind:
        gsm8k_accuracy: 53.12
        GPQA_diamond_accuracy: 25.00
        race-high_accuracy: 78.12
        winogrande_accuracy: 71.88
    internlm2-base-7b-turbomind:
        gsm8k_accuracy: 25.00
        GPQA_diamond_accuracy: 34.38
        race-high_accuracy: 71.88
        winogrande_accuracy: 62.50
    llama-2-7b-hf:
        gsm8k_accuracy: 21.88
        GPQA_diamond_accuracy: 21.88
        race-high_accuracy: 40.62
        winogrande_accuracy: 71.88
    llama-3_1-8b-hf:
        gsm8k_accuracy: 78.12
        GPQA_diamond_accuracy: 25
        race-high_accuracy: 90.62
        winogrande_accuracy: 62.5
    llama-3-8b-hf:
        gsm8k_accuracy: 46.88
        GPQA_diamond_accuracy: 6.25
        race-high_accuracy: 65.62
        winogrande_accuracy: 65.62
    llama-3.1-8b-turbomind:
        gsm8k_accuracy: 56.25
        GPQA_diamond_accuracy: 9.38
        race-high_accuracy: 78.12
        winogrande_accuracy: 78.12
    llama-3-8b-turbomind:
        gsm8k_accuracy: 46.88
        GPQA_diamond_accuracy: 12.50
        race-high_accuracy: 65.62
        winogrande_accuracy: 81.25
    mistral-7b-v0.3-hf:
        gsm8k_accuracy: 31.25
        GPQA_diamond_accuracy: 6.25
        race-high_accuracy: 62.5
        winogrande_accuracy: 59.38
    qwen2.5-7b-hf:
        gsm8k_accuracy: 81.25
        GPQA_diamond_accuracy: 18.75
        race-high_accuracy: 87.5
        winogrande_accuracy: 71.88
    qwen2.5-1.5b-turbomind:
        gsm8k_accuracy: 59.38
        GPQA_diamond_accuracy: 21.88
        race-high_accuracy: 78.12
        winogrande_accuracy: 71.88
    qwen2.5-7b-turbomind:
        gsm8k_accuracy: 78.12
        GPQA_diamond_accuracy: 21.88
        race-high_accuracy: 87.5
        winogrande_accuracy: 75.00
    qwen1.5-moe-a2.7b-hf:
        gsm8k_accuracy: 62.5
        GPQA_diamond_accuracy: 18.75
        race-high_accuracy: 84.38
        winogrande_accuracy: 75
    qwen2-0.5b-hf:
        gsm8k_accuracy: 25
        GPQA_diamond_accuracy: 0
        race-high_accuracy: 40.62
        winogrande_accuracy: 62.5
    qwen2-1.5b-hf:
        gsm8k_accuracy: 59.38
        GPQA_diamond_accuracy: 9.38
        race-high_accuracy: 81.25
        winogrande_accuracy: 62.5
    qwen2-7b-hf:
        gsm8k_accuracy: 68.75
        GPQA_diamond_accuracy: 9.38
        race-high_accuracy: 87.5
        winogrande_accuracy: 68.75
    qwen2-1.5b-turbomind:
        gsm8k_accuracy: 56.25
        GPQA_diamond_accuracy: 12.50
        race-high_accuracy: 81.25
        winogrande_accuracy: 75
    qwen2-7b-turbomind:
        gsm8k_accuracy: 65.62
        GPQA_diamond_accuracy: 12.5
        race-high_accuracy: 87.5
        winogrande_accuracy: 75
    qwen1.5-0.5b-vllm:
        gsm8k_accuracy: 9.38
        GPQA_diamond_accuracy: 3.12
        race-high_accuracy: 56.25
        winogrande_accuracy: 59.38
    yi-1.5-6b-hf:
        gsm8k_accuracy: 62.5
        GPQA_diamond_accuracy: 3.12
        race-high_accuracy: 87.5
        winogrande_accuracy: 62.5
    yi-1.5-9b-hf:
        gsm8k_accuracy: 75
        GPQA_diamond_accuracy: 40.62
        race-high_accuracy: 87.5
        winogrande_accuracy: 59.38
    yi-1.5-9b-turbomind:
        gsm8k_accuracy: 75.00
        GPQA_diamond_accuracy: 40.62
        race-high_accuracy: 87.5
        winogrande_accuracy: 65.62
    internlm2-20b-turbomind:
        gsm8k_accuracy: 71.88
        GPQA_diamond_accuracy: 18.75
        race-high_accuracy: 68.75
        winogrande_accuracy: 81.25
    qwen2.5-14b-hf:
        gsm8k_accuracy: 75
        GPQA_diamond_accuracy: 37.5
        race-high_accuracy: 93.75
        winogrande_accuracy: 84.38
    qwen2.5-32b-hf:
        gsm8k_accuracy: 87.5
        GPQA_diamond_accuracy: 31.25
        race-high_accuracy: 93.75
        winogrande_accuracy: 78.12
    qwen2.5-32b-turbomind:
        gsm8k_accuracy: 90.62
        GPQA_diamond_accuracy: 31.25
        race-high_accuracy: 93.75
        winogrande_accuracy: 81.25
    deepseek-67b-base-turbomind:
        gsm8k_accuracy: 62.50
        GPQA_diamond_accuracy: 31.25
        race-high_accuracy: 78.12
        winogrande_accuracy: 81.25
    llama-3-70b-turbomind:
        gsm8k_accuracy: 56.25
        GPQA_diamond_accuracy: 15.62
        race-high_accuracy: 93.75
        winogrande_accuracy: 84.38
    qwen2.5-72b-turbomind:
        gsm8k_accuracy: 84.38
        GPQA_diamond_accuracy: 40.62
        race-high_accuracy: 93.75
        winogrande_accuracy: 87.5
    deepseek-v2-turbomind:
        gsm8k_accuracy: 65.62
        GPQA_diamond_accuracy: 3.12
        race-high_accuracy: 93.75
        winogrande_accuracy: 81.25
--- a/.github/scripts/pr_oc_score_assert.py
+++ b/.github/scripts/pr_oc_score_assert.py
@ -0,0 +1,77 @@
 import csv
 import os
 import pytest
 # Directory tree that the result_scores fixture searches for the result CSV.
 output_path = 'regression_result'
 # Column name (model) and dataset row looked up in the parsed CSV by the test.
 model = 'internlm2-chat-7b-hf'
 dataset = 'siqa'
@pytest.fixture()
def result_scores():
    """Parse the single result CSV found under ``output_path``.

    Returns:
        The nested mapping built by ``read_csv_file``, or ``None`` when
        ``find_csv_files`` finds no CSV file.
    """
    csv_path = find_csv_files(output_path)
    return None if csv_path is None else read_csv_file(csv_path)
@pytest.mark.usefixtures('result_scores')
class TestChatScore:
    """Test cases for chat model."""

    def test_model_dataset_score(self, result_scores):
        """Check the ``model``/``dataset`` score against the 79.53 baseline.

        The fixture yields ``None`` when no CSV was produced, and the parsed
        mapping has no entry for a model that is absent from the CSV. Both
        cases are reported explicitly instead of surfacing as an
        ``AttributeError`` on ``None.get``.
        """
        assert result_scores is not None, (
            f'no result csv found under {output_path}')
        model_scores = result_scores.get(model)
        assert model_scores is not None, (
            f'model {model} not found in result csv')
        assert_score(model_scores.get(dataset), 79.53)
 def assert_score(score, baseline):
    """Assert that ``score`` lies strictly within 3% of ``baseline``.

    Args:
        score: Measured value, as a number or a numeric string. ``None`` and
            the ``'-'`` placeholder are treated as a missing value.
        baseline: Expected reference value.

    Raises:
        AssertionError: If the score is missing, or is not strictly between
            ``baseline * 0.97`` and ``baseline * 1.03``.
    """
    if score is None or score == '-':
        # Raise explicitly: a bare ``assert`` is removed under ``python -O``.
        raise AssertionError('value is none')
    lower, upper = baseline * 0.97, baseline * 1.03
    # f-strings accept any score type; ``score + '...'`` raised TypeError
    # whenever the score was passed as a number instead of a string.
    if lower < float(score) < upper:
        print(f'{score} between {lower} and {upper}')
    else:
        raise AssertionError(f'{score} not between {lower} and {upper}')
 def find_csv_files(directory):
    """Locate the single ``.csv`` file anywhere below ``directory``.

    Args:
        directory: Root of the tree walked with ``os.walk``.

    Returns:
        The path of the only CSV file found, or ``None`` if there is none.

    Raises:
        RuntimeError: If more than one CSV file is found.
    """
    csv_files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith('.csv')
    ]
    if len(csv_files) > 1:
        # Raising a plain string is itself a TypeError in Python 3, which hid
        # the intended message; raise a real exception instead.
        raise RuntimeError(
            'have more than 1 result file, please check the result manually')
    if not csv_files:
        return None
    return csv_files[0]
 def read_csv_file(file_path):
    """Load a result CSV into a ``{column: {dataset: value}}`` mapping.

    The ``version``, ``metric`` and ``mode`` columns are dropped. Every
    remaining column other than ``dataset`` becomes an outer key whose inner
    dict maps each row's ``dataset`` cell to that row's cell in the column.
    Cell values are stored exactly as ``csv.DictReader`` yields them.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        dict: The nested mapping described above; empty when the file has no
        data rows.
    """
    ignored_columns = ('version', 'metric', 'mode', 'dataset')
    result = {}
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(file_path, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            dataset_name = row.get('dataset')
            for column, value in row.items():
                if column in ignored_columns:
                    continue
                # Later rows for the same dataset overwrite earlier ones.
                result.setdefault(column, {})[dataset_name] = value
    return result
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@ -0,0 +1,351 @@
 name: daily_run_test
 on:
  workflow_dispatch:
    inputs:
      repo_org:
        required: false
        description: 'Tested repository organization name. Default is open-compass/opencompass'
        type: string
        default: 'open-compass/opencompass'
      repo_ref:
        required: false
        description: 'Set branch or tag or commit id. Default is "main"'
        type: string
        default: 'main'
      build_lmdeploy:
        required: false
        description: 'whether to build lmdeploy'
        type:  boolean
        default: false
      repo_org_lmdeploy:
        required: false
        description: 'Tested repository organization name. Default is internlm/lmdeploy'
        type: string
        default: 'InternLM/lmdeploy'
      repo_ref_lmdeploy:
        required: false
        description: 'Set branch or tag or commit id. Default is "main"'
        type: string
        default: 'main'
      regression_func_volc:
        required: true
        description: 'regression functions'
        type: string
        default: "['chat_models','base_models', 'chat_obj_fullbench', 'base_fullbench']"
      regression_func_local:
        required: true
        description: 'regression functions'
        type: string
        default: "['cmd', 'api', 'chat_sub_fullbench']"
      fullbench_eval:
        required: true
        description: 'fullbench volc functions'
        type: string
        default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
  schedule:
    - cron:  '15 14 * * 0,3'
 env:
  # Workflow-wide environment shared by every job below.
  HF_DATASETS_OFFLINE: 1
  HF_EVALUATE_OFFLINE: 1
  TRANSFORMERS_OFFLINE: 1
  VLLM_USE_MODELSCOPE: false
  LMDEPLOY_USE_MODELSCOPE: false
  HF_HUB_OFFLINE: 1
  OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
  CONDA_PATH: ${{ secrets.WORKSPACE_PREFIX }}/miniconda3
  PIP_CACHE_PATH: ${{ secrets.WORKSPACE_PREFIX }}/.cache/pip
  REPORT_ROOT: ${{ secrets.WORKSPACE_PREFIX }}/eval_report/regression
  COMPASS_DATA_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/datasets/compass_data_cache
  HUGGINGFACE_HUB_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/models/opencompass_hf_hub
  HF_HUB_CACHE:  ${{ secrets.SHARESPACE_PREFIX }}/models/opencompass_hf_hub
  HF_DATASETS_CACHE:  ${{ secrets.SHARESPACE_PREFIX }}/datasets/hf_datasets_cache
  HF_ENDPOINT: https://hf-mirror.com
  CONDA_ENV: regression_test
  # Keys here are variable names, not shell statements: an `export ` prefix
  # would become part of the name and leave the real variable unset.
  VLLM_WORKER_MULTIPROC_METHOD: spawn
 jobs:
  build-pypi:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
          ref: ${{github.event.inputs.repo_ref || 'main'}}
      - name: Set up Python 3.10
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Build lagent
        run: |
          pip install wheel setuptools
          python setup.py sdist bdist_wheel
      - name: Upload Artifacts
        uses: actions/upload-artifact@v4
        with:
          if-no-files-found: error
          path: dist/*
          retention-days: 1
          name: my-artifact-${{ github.run_id }}
  build-pypi-lmdeploy:
    if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.build_lmdeploy)}}
    strategy:
      matrix:
        pyver: [py310]
    runs-on: ubuntu-latest
    env:
      PYTHON_VERSION: ${{ matrix.pyver }}
      PLAT_NAME: manylinux2014_x86_64
      DOCKER_TAG: cuda12.1
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          repository: ${{ github.event.inputs.repo_org_lmdeploy || 'InternLM/lmdeploy' }}
          ref: ${{github.event.inputs.repo_ref_lmdeploy || 'main'}}
      - name: Build
        run: |
          echo ${PYTHON_VERSION}
          echo ${PLAT_NAME}
          echo ${DOCKER_TAG}
          echo ${OUTPUT_FOLDER}
          echo ${GITHUB_RUN_ID}
          # remove -it
          sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
          bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
      - name: Upload Artifacts
        uses: actions/upload-artifact@v4
        with:
          if-no-files-found: error
          path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
          retention-days: 1
          name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}
  prepare_env:
    if: ${{!cancelled()}}
    needs: ['build-pypi', 'build-pypi-lmdeploy']
    runs-on: volc_cu12
    timeout-minutes: 120 #2hours
    steps:
      - name: Clone repository
        uses: actions/checkout@v2
        with:
          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
          ref: ${{github.event.inputs.repo_ref || 'main'}}
      - name: Download Artifacts
        uses: actions/download-artifact@v4
        with:
          name: my-artifact-${{ github.run_id }}
      - name:  Remove Conda Env
        if: always()
        run: |
          . ${{ secrets.WORKSPACE_PREFIX }}/miniconda3/bin/activate
          conda env remove -y --name ${{env.CONDA_ENV}}
          conda info --envs
      - name: Prepare - create conda env and install torch - cu12
        uses: nick-fields/retry@v3
        with:
          max_attempts: 3
          timeout_minutes: 120
          command: |
            . ${{env.CONDA_PATH}}/bin/activate
            conda create -y --name ${{env.CONDA_ENV}} python=3.10
            conda activate ${{env.CONDA_ENV}}
            pip install -r ${{ secrets.WORKSPACE_PREFIX }}/config/requirements.txt --cache-dir ${{env.PIP_CACHE_PATH}}
            pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
            pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
            pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
            pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}}
            pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}}
            pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}}
            FLASH_ATTENTION_FORCE_BUILD=TRUE pip install ${{ secrets.WORKSPACE_PREFIX }}/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
            pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
            cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
      - name: Prepare - reinstall lmdeploy - cu12
        if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
        uses: actions/download-artifact@v4
        with:
          name: my-artifact-${{ github.run_id }}-py310
      - name: Prepare - reinstall lmdeploy - cu12
        if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          pip uninstall -y lmdeploy
          pip install lmdeploy-*.whl --no-deps
      - name: conda env
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          pip list
  daily_run_test_volc:
    if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
    needs: prepare_env
    strategy:
      fail-fast: false
      matrix:
        regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
    runs-on: volc_cu12_daily
    timeout-minutes: 180 #3hours
    steps:
      - name: Clone repository
        uses: actions/checkout@v2
        with:
          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
          ref: ${{github.event.inputs.repo_ref || 'main'}}
      - name: conda env
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          pip list
      - name:  modify config
        if: matrix.regression_func != 'chat_sub_fullbench'
        run: |
          cp -r ${{ secrets.WORKSPACE_PREFIX }}/ocplayground/template/configs_cluster/volc.py .
          cat ${{ secrets.WORKSPACE_PREFIX }}/config/test_config.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
      - name:  Run test
        uses: nick-fields/retry@v3
        with:
          max_attempts: 1
          timeout_minutes: 180
          command: |
            . ${{env.CONDA_PATH}}/bin/activate
            conda activate ${{env.CONDA_ENV}}
            conda info --envs
            opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details
            rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily
            python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py
  daily_run_test_local:
    if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
    needs: prepare_env
    strategy:
      fail-fast: false
      matrix:
        regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
    runs-on: volc_cu12_local
    timeout-minutes: 480 #8hours
    steps:
      - name: Clone repository
        uses: actions/checkout@v2
        with:
          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
          ref: ${{github.event.inputs.repo_ref || 'main'}}
      - name: conda env
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          pip list
      - name:  modify config
        if: matrix.regression_func == 'chat_sub_fullbench'
        run: |
          cp -r ${{ secrets.WORKSPACE_PREFIX }}/ocplayground/template/configs_cluster/volc.py .
          cat ${{ secrets.WORKSPACE_PREFIX }}/config/test_config_sub.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
      - name:  Run command testcase
        if: matrix.regression_func == 'cmd'
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          export from_tf=TRUE
          python tools/list_configs.py internlm2_5 mmlu
          opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
          python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
          opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
          python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
          opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
          python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
          opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
          python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
          opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
          python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py
      # Starts an lmdeploy API server in the background, runs the API
      # regression config against it, then checks the scores with pytest.
      - name:  Run model test - api
        if: matrix.regression_func == 'api'
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log  2>&1  &
          echo "restful_pid=$!" >> "$GITHUB_ENV"
          sleep 180s
          # Diagnostic only: grep exits 1 when no proxy variable is set, which
          # would abort the step under the default `bash -e` shell.
          env | grep PROXY || true
          env | grep proxy || true
          unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
          opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
          python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
      - name:  Run model test - api kill
        if: always() && matrix.regression_func == 'api'
        run: |
          kill -15 "$restful_pid"
      - name:  Run testcase
        if: matrix.regression_func == 'chat_sub_fullbench'
        env:
          COMPASS_DATA_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/datasets/compass_data_cache_subset
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          export from_tf=TRUE
          opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily
          python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py
  fullbench_run_test:
    if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
    needs: prepare_env
    strategy:
      fail-fast: false
      matrix:
        function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
    runs-on: volc_cu12
    timeout-minutes: 480 #8hours
    steps:
      - name: Clone repository
        uses: actions/checkout@v2
        with:
          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
          ref: ${{github.event.inputs.repo_ref || 'main'}}
      - name: conda env
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          pip list
      - name:  Run testcase
        uses: nick-fields/retry@v3
        with:
          max_attempts: 1
          timeout_minutes: 480
          command: |
            . ${{env.CONDA_PATH}}/bin/activate
            conda activate ${{env.CONDA_ENV}}
            conda info --envs
            export from_tf=TRUE
            opencompass ${{ secrets.WORKSPACE_PREFIX }}/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
            rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily
            python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py
  notify_to_feishu:
    if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
    needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
    timeout-minutes: 5
    runs-on: self-hosted
    steps:
      - name: notify
        run: |
          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- Daily test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}'  ${{ secrets.WEBHOOK_URL }}
--- a/.github/workflows/link-check.yml
+++ b/.github/workflows/link-check.yml
@ -0,0 +1,26 @@
 # Workflow that checks the links of the published documentation site.
 name: 'Link check'
 on:
  schedule:
    # check links at 01:30 a.m. every day
    - cron: '30 1 * * *'
  workflow_dispatch: # allow manual trigger
 jobs:
  link-check:
    runs-on: ubuntu-latest
    steps:
      # The repository checkout is disabled: the job only installs
      # linkchecker and points it at the hosted documentation URL.
      # - uses: actions/checkout@v3
      - name: Install linkchecker
        run: |
          pip install linkchecker
      # Several URL patterns are excluded via --ignore-url (logo/icon assets,
      # the policy page, and paths containing a 40-hex-digit segment).
      - name: Run linkchecker
        run: |
          linkchecker https://opencompass.readthedocs.io/ --no-robots -t 30 --no-warnings \
            --ignore-url "https://opencompass.readthedocs.io/.*/static/images/opencompass_logo.svg" \
            --ignore-url "https://opencompass.readthedocs.io/.*/_static/images/icon-menu-dots.svg" \
            --ignore-url "https://opencompass.readthedocs.io/policy" \
            --ignore-url "https://opencompass.readthedocs.io/(en|zh_CN)/[0-9a-f]{40}/.*"
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@ -17,7 +17,7 @@ jobs:
          python-version: '3.10'
      - name: Install pre-commit hook
        run: |
-          pip install pre-commit mmengine
+          pip install pre-commit==3.8.0 mmengine==0.10.5
          pre-commit install
      - name: Linting
        run: pre-commit run --all-files
--- a/.github/workflows/pr-run-test.yml
+++ b/.github/workflows/pr-run-test.yml
@ -0,0 +1,106 @@
 name: pr_run_test
 on:
  pull_request:
    paths-ignore:
      - 'README.md'
      - 'README_zh-CN.md'
      - 'docs/**'
      - 'configs/**'
      - 'tools/**'
  workflow_dispatch:
  schedule:
    - cron:  '56 22 * * *'
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 env:
  CONDA_ENV: pr_test
  HF_DATASETS_OFFLINE: 1
  HF_EVALUATE_OFFLINE: 1
  TRANSFORMERS_OFFLINE: 1
  VLLM_USE_MODELSCOPE: false
  LMDEPLOY_USE_MODELSCOPE: false
  HF_HUB_OFFLINE: 1
  CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3
  PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip
  REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/prtest
  COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
  HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
  HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
 jobs:
  pr_run_test:
    runs-on: volc_cu12_local
    environment: 'prod'
    timeout-minutes: 30
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2
      - name: Prepare - Install opencompass
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          python3 -m pip uninstall opencompass -y
          python3 -m pip install -e ".[full]" --cache-dir ${{env.PIP_CACHE_PATH}}
          conda info --envs
      - name: conda env
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          pip list
          lmdeploy check_env
      - name:  Run test
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          rm -rf regression_result
          opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1 --debug
          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2 --debug --max-num-workers 2
          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3 --debug --max-num-workers 2
      - name:  Get result
        run: |
          score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}')
          if (( ${score%.*} >= 88 && ${score%.*} <= 89 )); then
             echo "score is $score between 88 and 89"
          else
             echo "score is $score not between 88 and 89"
             exit 1
          fi
          score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}')
          if (( ${score%.*} >= 87 && ${score%.*} <= 88 )); then
             echo "score is $score between 87 and 88"
          else
             echo "score is $score not between 87 and 88"
             exit 1
          fi
          score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}')
          if (( ${score%.*} >= 87 && ${score%.*} <= 91 )); then
             echo "score is $score between 87 and 91"
          else
             echo "score is $score not between 87 and 91"
             exit 1
          fi
      - name:  Uninstall opencompass
        if: always()
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          python3 -m pip uninstall opencompass -y
          conda info --envs
  notify_to_feishu:
    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
    needs: [pr_run_test]
    timeout-minutes: 5
    runs-on: self-hosted
    environment: 'prod'
    steps:
      - name: notify
        run: |
          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}'  ${{ secrets.WEBHOOK_URL }}
--- a/.github/workflows/pr-stage-check.yml
+++ b/.github/workflows/pr-stage-check.yml
@ -0,0 +1,121 @@
 name: pr_stage_test
 on:
  pull_request:
    paths-ignore:
      - 'README.md'
      - 'README_zh-CN.md'
      - 'docs/**'
      - 'configs/**'
      - 'tools/**'
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  build:
    runs-on: ubuntu-22.04
    strategy:
      matrix:
        python-version: ['3.10']
        include:
          - torch: 2.5.1
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Upgrade pip
        run: python -m pip install --upgrade pip
      - name: Install PyTorch
        run: pip install torch==${{matrix.torch}} -f https://download.pytorch.org/whl/cpu/torch_stable.html
      - name: Install system dependencies
        run: |
          sudo sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list
          sudo apt-get update && sudo apt-get install -y libc6 libffi-dev libncursesw6 wget unzip
      - name: Upgrade pip
        run: python -m pip install pip --upgrade
      - name: Install opencompass dependencies
        run: |
          python -m pip install -r requirements.txt
      - name: Build and install
        run: python -m pip install -e .
      - name: Prepare dataset
        run: |
          wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
          unzip OpenCompassData-core-20240207.zip
      - name: Dry run test
        run: |
          python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
  build_cu117:
    runs-on: ubuntu-22.04
    container:
      image: nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04
    strategy:
      matrix:
        python-version: ['3.10']
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Fetch GPG keys
        run: |
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
      - name: Install Python-dev
        run: apt-get update && apt-get install -y python${{matrix.python-version}}-dev
        if: ${{matrix.python-version != 3.10}}
      - name: Install system dependencies
        run: |
          apt-get update
          apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev libc6 libc6-dev
          sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list
          apt-get update && apt-get install -y libc6 libffi-dev libncursesw6 wget unzip
      - name: Upgrade pip
        run: python -m pip install pip --upgrade
      - name: Install opencompass dependencies
        run: |
          python -m pip install -r requirements.txt
      - name: Build and install
        run: python -m pip install -e .
      - name: Prepare dataset
        run: |
          wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
          unzip OpenCompassData-core-20240207.zip
      - name: Dry run test
        run: |
          python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
  build_windows:
    runs-on: windows-2022
    strategy:
      matrix:
        python-version: ['3.10']
        platform: [cpu]
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Upgrade pip
        run: python -m pip install pip --upgrade
      - name: Install PyTorch
        run: pip install torch==2.5.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html
      - name: Install opencompass dependencies
        run: |
          pip install -r requirements.txt
      - name: Build and install
        run: pip install -e .
      - name: Prepare dataset
        run: |
          Invoke-WebRequest -Uri https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip -OutFile OpenCompassData-core-20240207.zip
          unzip OpenCompassData-core-20240207.zip
      - name: Dry run test
        run: |
          python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
--- a/.github/workflows/publish-to-pypi.yml
+++ b/.github/workflows/publish-to-pypi.yml
@ -1,21 +1,26 @@
 name: deploy
-on: push
+on:
-
+  push:
-concurrency:
+  workflow_dispatch:
-  group: ${{ github.workflow }}-${{ github.ref }}
+    inputs:
-  cancel-in-progress: true
+      confirm_publish:
        description: 'Type YES to confirm publishing to PyPI'
        required: true
        type: string
 jobs:
  build-n-publish:
    runs-on: ubuntu-latest
-    if: startsWith(github.event.ref, 'refs/tags')
+    if: |
      github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') ||
      (github.event_name == 'workflow_dispatch' && inputs.confirm_publish == 'YES')
    steps:
      - uses: actions/checkout@v2
-      - name: Set up Python 3.7
+      - name: Set up Python 3.10
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v4
        with:
-          python-version: 3.7
+          python-version: '3.10'
      - name: Build lagent
        run: |
          pip install wheel
--- a/.gitignore
+++ b/.gitignore
@ -1,17 +1,23 @@
-
+.DS_Store
 output_*/
 outputs/
 scripts/
 icl_inference_output/
 .vscode/
 tmp/
 configs/eval_subjective_alignbench_test.py
 configs/openai_key.py
 configs/secrets.py
 configs/datasets/log.json
 configs/eval_debug*.py
 configs/viz_*.py
 configs/**/*_bkup.py
 opencompass/**/*_bkup.py
 data
 work_dirs
-
+outputs
 models/*
 configs/internal/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@ -86,3 +92,44 @@ docs/zh_cn/_build/
 # .zip
 *.zip
 # sft config ignore list
 configs/sft_cfg/*B_*
 configs/sft_cfg/1B/*
 configs/sft_cfg/7B/*
 configs/sft_cfg/20B/*
 configs/sft_cfg/60B/*
 configs/sft_cfg/100B/*
 configs/cky/
 configs/_internal_legacy*
 # in case llama clone in the opencompass
 llama/
 # in case ilagent clone in the opencompass
 ilagent/
 # ignore the config file for criticbench evaluation
 configs/sft_cfg/criticbench_eval/*
 # path of turbomind's model after running `lmdeploy.serve.turbomind.deploy`
 turbomind/
 # cibench output
 *.db
 *.pth
 *.pt
 *.onnx
 *.gz
 *.gz.*
 *.png
 *.txt
 *.jpg
 *.json
 *.jsonl
 *.csv
 *.npy
 *.c
 # aliyun
 core.*
--- a/.owners.yml
+++ b/.owners.yml
@ -7,8 +7,8 @@ assign:
  scedule:
    '*/1 * * * *'
  assignees:
-    - Leymore
+    - bittersweet1999
-    - gaotongxiao
+    - liushz
-    - yingfhu
+    - MaiziXiao
-    - fangyixiao18
+    - acylam
    - tonysy
--- a/.pre-commit-config-zh-cn.yaml
+++ b/.pre-commit-config-zh-cn.yaml
@ -1,26 +1,50 @@
 exclude: |
    (?x)^(
      tests/data/|
      tests/dataset/|
      opencompass/models/internal/|
      opencompass/utils/internal/|
-      opencompass/openicl/icl_evaluator/hf_metrics/
+      opencompass/openicl/icl_evaluator/hf_metrics/|
      opencompass/datasets/lawbench/utils|
      opencompass/datasets/lawbench/evaluation_functions/|
      opencompass/datasets/medbench/|
      opencompass/datasets/teval/|
      opencompass/datasets/NPHardEval/|
      opencompass/datasets/TheoremQA|
      opencompass/datasets/subjective/mtbench101.py|
      docs/zh_cn/advanced_guides/compassbench_intro.md |
      docs/zh_cn/advanced_guides/compassbench_v2_0.md |
      opencompass/utils/datasets.py |
      opencompass/utils/datasets_info.py
    )
 repos:
  - repo: https://gitee.com/openmmlab/mirrors-flake8
    rev: 5.0.4
    hooks:
      - id: flake8
-        exclude: configs/
+        exclude: |
            (?x)^(
                opencompass/configs/|
                examples/
            )
  - repo: https://gitee.com/openmmlab/mirrors-isort
    rev: 5.11.5
    hooks:
      - id: isort
-        exclude: configs/
+        exclude: |
            (?x)^(
                opencompass/configs/|
                examples/
            )
  - repo: https://gitee.com/openmmlab/mirrors-yapf
    rev: v0.32.0
    hooks:
      - id: yapf
-        exclude: configs/
+        exclude: |
            (?x)^(
                opencompass/configs/|
                examples/
            )
  - repo: https://gitee.com/openmmlab/mirrors-codespell
    rev: v2.2.1
    hooks:
@ -28,7 +52,9 @@ repos:
        exclude: |
            (?x)^(
                .*\.jsonl|
-                configs/
+                .*\.md.template|
                opencompass/configs/ |
                examples/
            )
  - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
    rev: v4.3.0
@ -38,7 +64,6 @@ repos:
            (?x)^(
              dicts/|
              projects/.*?/dicts/|
              configs/
            )
      - id: check-yaml
      - id: end-of-file-fixer
@ -46,18 +71,14 @@ repos:
            (?x)^(
              dicts/|
              projects/.*?/dicts/|
              configs/
            )
      - id: requirements-txt-fixer
      - id: double-quote-string-fixer
        exclude: configs/
      - id: check-merge-conflict
      - id: fix-encoding-pragma
        args: ["--remove"]
      - id: mixed-line-ending
        args: ["--fix=lf"]
      - id: mixed-line-ending
        args: ["--fix=lf"]
  - repo: https://gitee.com/openmmlab/mirrors-mdformat
    rev: 0.7.9
    hooks:
@ -81,7 +102,25 @@ repos:
        language: script
        pass_filenames: true
        require_serial: true
-        files: ^configs/datasets
+        files: ^opencompass/configs/datasets
  - repo: local
    hooks:
    -   id: update-dataset-suffix-pacakge
        name: dataset suffix updater(package)
        entry: ./tools/update_dataset_suffix.py
        language: script
        pass_filenames: false
        # require_serial: true
        # files: ^opencompass/configs/datasets
        args:
          - --root_folder
          - opencompass/configs/datasets
  - repo: https://gitee.com/mirrors/gitleaks
    rev: v8.23.1
    hooks:
    -   id: gitleaks
        entry: "gitleaks dir"
        args: ["--verbose", "--redact=50"]
  # - repo: https://github.com/open-mmlab/pre-commit-hooks
  #   rev: v0.2.0  # Use the ref you want to point at
  #   hooks:
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,26 +1,51 @@
 exclude: |
    (?x)^(
      tests/data/|
      tests/dataset/|
      opencompass/models/internal/|
      opencompass/utils/internal/|
-      opencompass/openicl/icl_evaluator/hf_metrics/
+      opencompass/openicl/icl_evaluator/hf_metrics/|
      opencompass/datasets/lawbench/utils|
      opencompass/datasets/lawbench/evaluation_functions/|
      opencompass/datasets/medbench/|
      opencompass/datasets/matbench/|
      opencompass/datasets/teval/|
      opencompass/datasets/NPHardEval/|
      opencompass/datasets/TheoremQA|
      opencompass/datasets/subjective/mtbench101.py|
      docs/zh_cn/advanced_guides/compassbench_intro.md |
      docs/zh_cn/advanced_guides/compassbench_v2_0.md |
      opencompass/utils/datasets.py |
      opencompass/utils/datasets_info.py
    )
 repos:
  - repo: https://github.com/PyCQA/flake8
    rev: 5.0.4
    hooks:
      - id: flake8
-        exclude: configs/
+        exclude: |
            (?x)^(
                opencompass/configs/|
                examples/
            )
  - repo: https://github.com/PyCQA/isort
    rev: 5.11.5
    hooks:
      - id: isort
-        exclude: configs/
+        exclude: |
            (?x)^(
                opencompass/configs/|
                examples/
            )
  - repo: https://github.com/pre-commit/mirrors-yapf
    rev: v0.32.0
    hooks:
      - id: yapf
-        exclude: configs/
+        exclude: |
            (?x)^(
                opencompass/configs/|
                examples/
            )
  - repo: https://github.com/codespell-project/codespell
    rev: v2.2.1
    hooks:
@ -28,7 +53,9 @@ repos:
        exclude: |
            (?x)^(
                .*\.jsonl|
-                configs/
+                .*\.md.template|
                opencompass/configs/ |
                examples/
            )
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0
@ -38,7 +65,6 @@ repos:
            (?x)^(
              dicts/|
              projects/.*?/dicts/|
              configs/
            )
      - id: check-yaml
      - id: end-of-file-fixer
@ -46,18 +72,14 @@ repos:
            (?x)^(
              dicts/|
              projects/.*?/dicts/|
              configs/
            )
      - id: requirements-txt-fixer
      - id: double-quote-string-fixer
        exclude: configs/
      - id: check-merge-conflict
      - id: fix-encoding-pragma
        args: ["--remove"]
      - id: mixed-line-ending
        args: ["--fix=lf"]
      - id: mixed-line-ending
        args: ["--fix=lf"]
  - repo: https://github.com/executablebooks/mdformat
    rev: 0.7.9
    hooks:
@ -81,7 +103,25 @@ repos:
        language: script
        pass_filenames: true
        require_serial: true
-        files: ^configs/datasets
+        files: ^opencompass/configs/datasets
  - repo: local
    hooks:
    -   id: update-dataset-suffix-pacakge
        name: dataset suffix updater(package)
        entry: ./tools/update_dataset_suffix.py
        language: script
        pass_filenames: false
        # require_serial: true
        # files: ^opencompass/configs/datasets
        args:
          - --root_folder
          - opencompass/configs/datasets
  - repo: https://github.com/gitleaks/gitleaks
    rev: v8.23.1
    hooks:
    -   id: gitleaks
        entry: "gitleaks dir"
        args: ["--verbose", "--redact=50"]
  # - repo: https://github.com/open-mmlab/pre-commit-hooks
  #   rev: v0.2.0  # Use the ref you want to point at
  #   hooks:
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -0,0 +1,3 @@
 recursive-include opencompass/configs *.py *.yml *.json *.txt *.md
 recursive-include opencompass/openicl/icl_evaluator/hf_metrics *.py
 recursive-include opencompass/datasets *.py *.yml *.json *.txt *.md *.yaml
--- a/README.md
+++ b/README.md
@ -3,256 +3,307 @@
  <br />
  <br />
-[![docs](https://readthedocs.org/projects/opencompass/badge)](https://opencompass.readthedocs.io/en)
+[![][github-release-shield]][github-release-link]
-[![license](https://img.shields.io/github/license/InternLM/opencompass.svg)](https://github.com/open-compass/opencompass/blob/main/LICENSE)
+[![][github-releasedate-shield]][github-releasedate-link]
 [![][github-contributors-shield]][github-contributors-link]<br>
 [![][github-forks-shield]][github-forks-link]
 [![][github-stars-shield]][github-stars-link]
 [![][github-issues-shield]][github-issues-link]
 [![][github-license-shield]][github-license-link]
 <!-- [![PyPI](https://badge.fury.io/py/opencompass.svg)](https://pypi.org/project/opencompass/) -->
 [🌐Website](https://opencompass.org.cn/) |
 [📖CompassHub](https://hub.opencompass.org.cn/home) |
 [📊CompassRank](https://rank.opencompass.org.cn/home) |
 [📘Documentation](https://opencompass.readthedocs.io/en/latest/) |
-[🛠️Installation](https://opencompass.readthedocs.io/en/latest/get_started.html#installation) |
+[🛠️Installation](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) |
 [🤔Reporting Issues](https://github.com/open-compass/opencompass/issues/new/choose)
 English | [简体中文](README_zh-CN.md)
 [![][github-trending-shield]][github-trending-url]
 </div>
 <p align="center">
    👋 join us on <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat</a>
 </p>
 > \[!IMPORTANT\]
 >
 > **Star Us**, You will receive all release notifications from GitHub without any delay ~ ⭐️
 <details>
  <summary><kbd>Star History</kbd></summary>
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
    <img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
  </picture>
 </details>
 ## 🧭	Welcome
 to **OpenCompass**!
 Just like a compass guides us on our journey, OpenCompass will guide you through the complex landscape of evaluating large language models. With its powerful algorithms and intuitive interface, OpenCompass makes it easy to assess the quality and effectiveness of your NLP models.
-> **🔥 Attention**<br />
+🚩🚩🚩 Explore opportunities at OpenCompass! We're currently **hiring full-time researchers/engineers and interns**. If you're passionate about LLM and OpenCompass, don't hesitate to reach out to us via [email](mailto:zhangsongyang@pjlab.org.cn). We'd love to hear from you!
-> We launch the OpenCompass Collabration project, welcome to support diverse evaluation benchmarks into OpenCompass!
+
-> Clike [Issue](https://github.com/open-compass/opencompass/issues/248) for more information.
+🔥🔥🔥 We are delighted to announce that **the OpenCompass has been recommended by the Meta AI**, click [Get Started](https://ai.meta.com/llama/get-started/#validation) of Llama for more information.
-> Let's work together to build a more powerful OpenCompass toolkit!
+
 > **Attention**<br />
 > Breaking Change Notice: In version 0.4.0, we are consolidating all AMOTIC configuration files (previously located in ./configs/datasets, ./configs/models, and ./configs/summarizers) into the opencompass package. Users are advised to update their configuration references to reflect this structural change.
 ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2023.09.20\]** We update the leaderboard with [InternLM-20B](https://github.com/InternLM/InternLM), welcome to our [homepage](https://opencompass.org.cn) for more details. 🔥🔥🔥.
+- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥
- **\[2023.09.19\]** We update the leaderboard with WeMix-LLaMA2-70B/Phi-1.5-1.3B, welcome to our [homepage](https://opencompass.org.cn) for more details. 🔥🔥🔥.
+- **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥
- **\[2023.09.18\]** We have released [long context evaluation guidance](docs/en/advanced_guides/longeval.md). 🔥🔥🔥.
+- **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2023.09.08\]** We update the leaderboard with Baichuan-2/Tigerbot-2/Vicuna-v1.5, welcome to our [homepage](https://opencompass.org.cn) for more details.
+- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHVerifyEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥
- **\[2023.09.06\]**  [**Baichuan2**](https://github.com/baichuan-inc/Baichuan2) team adpots OpenCompass to evaluate their models systematically. We deeply appreciate the community's dedication to transparency and reproducibility in LLM evaluation.
+- **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks.
- **\[2023.09.02\]** We have supported the evaluation of [Qwen-VL](https://github.com/QwenLM/Qwen-VL) in OpenCompass.
+- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it.
- **\[2023.08.25\]**  [**TigerBot**](https://github.com/TigerResearch/TigerBot) team adpots OpenCompass to evaluate their models systematically. We deeply appreciate the community's dedication to transparency and reproducibility in LLM evaluation.
+- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥
- **\[2023.08.21\]** [**Lagent**](https://github.com/InternLM/lagent) has been released, which is a lightweight framework for building LLM-based agents. We are working with Lagent team to support the evaluation of general tool-use capability, stay tuned!
+- **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](examples/eval_babilong.py) and give it a try! 🔥🔥🔥
 - **\[2024.10.14\]** We now support the OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU). Feel free to give it a try! 🔥🔥🔥
 - **\[2024.09.19\]** We now support [Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) with multiple backend(huggingface/vllm/lmdeploy). Feel free to give them a try! 🔥🔥🔥
 - **\[2024.09.17\]** We now support OpenAI o1(`o1-mini-2024-09-12` and `o1-preview-2024-09-12`). Feel free to give them a try! 🔥🔥🔥
 - **\[2024.09.05\]** We now support answer extraction through model post-processing to provide a more accurate representation of the model's capabilities. As part of this update, we have integrated [XFinder](https://github.com/IAAR-Shanghai/xFinder) as our first post-processing model. For more detailed information, please refer to the [documentation](opencompass/utils/postprocessors/xfinder/README.md), and give it a try! 🔥🔥🔥
 - **\[2024.08.20\]** OpenCompass now supports the [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists. 🔥🔥🔥
 - **\[2024.08.16\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [RULER](https://arxiv.org/pdf/2404.06654). RULER provides an evaluation of long-context including retrieval, multi-hop tracing, aggregation, and question answering through flexible configurations. Check out the [RULER](configs/datasets/ruler/README.md) evaluation config now! 🔥🔥🔥
 - **\[2024.08.09\]** We have released the example data and configuration for the CompassBench-202408, welcome to [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more details. 🔥🔥🔥
 - **\[2024.08.01\]** We supported the [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) models. Welcome to try! 🔥🔥🔥
 - **\[2024.07.23\]** We supported the [ModelScope](https://www.modelscope.cn) datasets, you can load them on demand without downloading all the data to your local disk. Welcome to try! 🔥🔥🔥
 - **\[2024.07.17\]** We are excited to announce the release of NeedleBench's [technical report](http://arxiv.org/abs/2407.11963). We invite you to visit our [support documentation](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html) for detailed evaluation guidelines. 🔥🔥🔥
 - **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which has **outstanding reasoning capability**, **1M Context window** and **stronger tool use**, you can try the models in [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM) .🔥🔥🔥.
 - **\[2024.06.20\]** OpenCompass now supports one-click switching between inference acceleration backends, enhancing the efficiency of the evaluation process. In addition to the default HuggingFace inference backend, it now also supports popular backends [LMDeploy](https://github.com/InternLM/lmdeploy) and [vLLM](https://github.com/vllm-project/vllm). This feature is available via a simple command-line switch and through deployment APIs. For detailed usage, see the [documentation](docs/en/advanced_guides/accelerator_intro.md).🔥🔥🔥.
 > [More](docs/en/notes/news.md)
 ## 📊 Leaderboard
 We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
 You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
 <p align="right"><a href="#top">🔝Back to top</a></p>
 ## 🛠️ Installation
 Below are the steps for quick installation and datasets preparation.
 ### 💻 Environment Setup
 We highly recommend using conda to manage your python environment.
 - #### Create your virtual environment
  ```bash
  conda create --name opencompass python=3.10 -y
  conda activate opencompass
  ```
 - #### Install OpenCompass via pip
  ```bash
    pip install -U opencompass
    ## Full installation (with support for more datasets)
    # pip install "opencompass[full]"
    ## Environment with model acceleration frameworks
    ## Manage different acceleration frameworks using virtual environments
    ## since they usually have dependency conflicts with each other.
    # pip install "opencompass[lmdeploy]"
    # pip install "opencompass[vllm]"
    ## API evaluation (i.e. Openai, Qwen)
    # pip install "opencompass[api]"
  ```
 - #### Install OpenCompass from source
  If you want to use opencompass's latest features, or develop new features, you can also build it from source
  ```bash
    git clone https://github.com/open-compass/opencompass opencompass
    cd opencompass
    pip install -e .
    # pip install -e ".[full]"
    # pip install -e ".[vllm]"
  ```
 ### 📂 Data Preparation
 You can choose one of the following methods to prepare datasets.
 #### Offline Preparation
 You can download and extract the datasets with the following commands:
 ```bash
 # Download dataset to data/ folder
 wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
 unzip OpenCompassData-core-20240207.zip
 ```
 #### Automatic Download from OpenCompass
 We support automatically downloading datasets from the OpenCompass storage server. You can run the evaluation with an extra `--dry-run` flag to download these datasets.
 Currently, the supported datasets are listed [here](https://github.com/open-compass/opencompass/blob/main/opencompass/utils/datasets_info.py#L259). More datasets will be uploaded soon.
 #### (Optional) Automatic Download with ModelScope
 You can also use [ModelScope](https://www.modelscope.cn) to load the datasets on demand.
 Installation:
 ```bash
 pip install modelscope[framework]
 export DATASET_SOURCE=ModelScope
 ```
 Then submit the evaluation task without downloading all the data to your local disk. Available datasets include:
 ```bash
 humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli
 ```
 Some third-party features, like Humaneval and Llama, may require additional steps to work properly, for detailed steps please refer to the [Installation Guide](https://opencompass.readthedocs.io/en/latest/get_started/installation.html).
 <p align="right"><a href="#top">🔝Back to top</a></p>
 ## 🏗️ ️Evaluation
 After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared, you can start your first evaluation using OpenCompass!
 ### Your first evaluation with OpenCompass!
 OpenCompass supports setting your configs via CLI or a Python script. For simple evaluation settings we recommend using the CLI; for more complex evaluations, we suggest using a script. You can find more example scripts under the configs folder.
 ```bash
 # CLI
 opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
 # Python scripts
 opencompass examples/eval_chat_demo.py
 ```
 You can find more script examples under [examples](./examples) folder.
 ### API evaluation
 OpenCompass, by its design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way or even in a single setting.
 ```bash
 export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
 # CLI
 opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
 # Python scripts
 opencompass examples/eval_api_demo.py
 # You can use o1_mini_2024_09_12/o1_preview_2024_09_12  for o1 models, we set max_completion_tokens=8192 as default.
 ```
 ### Accelerated Evaluation
 Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
 ```bash
 # CLI
 opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
 # Python scripts
 opencompass examples/eval_lmdeploy_demo.py
 ```
 ### Supported Models and Datasets
 OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
 ```bash
 # List all configurations
 python tools/list_configs.py
 # List all configurations related to llama and mmlu
 python tools/list_configs.py llama mmlu
 ```
 #### Supported Models
 If the model is not on the list but supported by Huggingface AutoModel class or encapsulation of inference engine based on OpenAI interface (see [docs](https://opencompass.readthedocs.io/en/latest/advanced_guides/new_model.html) for details), you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
 ```bash
 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
 ```
 #### Supported Datasets
 Currently, OpenCompass has provided standard recommended configurations for datasets. Generally, config files ending with `_gen.py` or `_llm_judge_gen.py` will point to the recommended config we provide for this dataset. You can refer to [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for more details.
 ```bash
 # Recommended Evaluation Config based on Rules
 opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
 # Recommended Evaluation Config based on LLM Judge
 opencompass --datasets aime2024_llmjudge_gen --models hf_internlm2_5_1_8b_chat
 ```
 If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
 ```bash
 CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
 ```
 > \[!TIP\]
 >
 > `--hf-num-gpus` is used for model parallel(huggingface format), `--max-num-worker` is used for data parallel.
 > \[!TIP\]
 >
 > configuration with `_ppl` is designed for base model typically.
 > configuration with `_gen` can be used for both base model and chat model.
 Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html) to learn how to run an evaluation task.
 <p align="right"><a href="#top">🔝Back to top</a></p>
 ## 📣 OpenCompass 2.0
 We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three key components: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home).
 ![oc20](https://github.com/tonysy/opencompass/assets/7881589/90dbe1c0-c323-470a-991e-2b37ab5350b2)
 **CompassRank** has been significantly enhanced into a leaderboard that now incorporates both open-source benchmarks and proprietary benchmarks. This upgrade allows for a more comprehensive evaluation of models across the industry.
 **CompassHub** presents a pioneering benchmark browser interface, designed to simplify and expedite the exploration and utilization of an extensive array of benchmarks for researchers and practitioners alike. To enhance the visibility of your own benchmark within the community, we warmly invite you to contribute it to CompassHub. You may initiate the submission process by clicking [here](https://hub.opencompass.org.cn/dataset-submit).
 **CompassKit** is a powerful collection of evaluation toolkits specifically tailored for Large Language Models and Large Vision-language Models. It provides an extensive set of tools to assess and measure the performance of these complex models effectively. You are welcome to try our toolkits in your research and products.
 ## ✨ Introduction
 ![image](https://github.com/open-compass/opencompass/assets/22607038/f45fe125-4aed-4f8c-8fe8-df4efb41a8ea)
-OpenCompass is a one-stop platform for large model evaluation, aiming to provide a fair, open, and reproducible benchmark for large model evaluation. Its main features includes:
+OpenCompass is a one-stop platform for large model evaluation, aiming to provide a fair, open, and reproducible benchmark for large model evaluation. Its main features include:
- **Comprehensive support for models and datasets**: Pre-support for 20+ HuggingFace and API models, a model evaluation scheme of 50+ datasets with about 300,000 questions, comprehensively evaluating the capabilities of the models in five dimensions.
+- **Comprehensive support for models and datasets**: Pre-support for 20+ HuggingFace and API models, a model evaluation scheme of 70+ datasets with about 400,000 questions, comprehensively evaluating the capabilities of the models in five dimensions.
 - **Efficient distributed evaluation**: One line command to implement task division and distributed evaluation, completing the full evaluation of billion-scale models in just a few hours.
- **Diversified evaluation paradigms**: Support for zero-shot, few-shot, and chain-of-thought evaluations, combined with standard or dialogue type prompt templates, to easily stimulate the maximum performance of various models.
+- **Diversified evaluation paradigms**: Support for zero-shot, few-shot, and chain-of-thought evaluations, combined with standard or dialogue-type prompt templates, to easily stimulate the maximum performance of various models.
 - **Modular design with high extensibility**: Want to add new models or datasets, customize an advanced task division strategy, or even support a new cluster management system? Everything about OpenCompass can be easily expanded!
- **Experiment management and reporting mechanism**: Use config files to fully record each experiment, support real-time reporting of results.
+- **Experiment management and reporting mechanism**: Use config files to fully record each experiment, and support real-time reporting of results.
 ## 📊 Leaderboard
 We provide [OpenCompass Leaderboard](https://opencompass.org.cn/rank) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
 <p align="right"><a href="#top">🔝Back to top</a></p>
 ## 📖 Dataset Support
-<table align="center">
+We have supported a statistical list of all datasets that can be used on this platform in the documentation on the OpenCompass website.
  <tbody>
    <tr align="center" valign="bottom">
      <td>
        <b>Language</b>
      </td>
      <td>
        <b>Knowledge</b>
      </td>
      <td>
        <b>Reasoning</b>
      </td>
      <td>
        <b>Comprehensive Examination</b>
      </td>
      <td>
        <b>Understanding</b>
      </td>
    </tr>
    <tr valign="top">
      <td>
 <details open>
 <summary><b>Word Definition</b></summary>
- WiC
+You can quickly find the dataset you need from the list through sorting, filtering, and searching functions.
 - SummEdits
-</details>
+In addition, we provide a recommended configuration for each dataset, and some datasets also support LLM Judge-based configurations.
-<details open>
+Please refer to the dataset statistics chapter of [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for details.
 <summary><b>Idiom Learning</b></summary>
 - CHID
 </details>
 <details open>
 <summary><b>Semantic Similarity</b></summary>
 - AFQMC
 - BUSTM
 </details>
 <details open>
 <summary><b>Coreference Resolution</b></summary>
 - CLUEWSC
 - WSC
 - WinoGrande
 </details>
 <details open>
 <summary><b>Translation</b></summary>
 - Flores
 </details>
      </td>
      <td>
 <details open>
 <summary><b>Knowledge Question Answering</b></summary>
 - BoolQ
 - CommonSenseQA
 - NaturalQuestion
 - TrivialQA
 </details>
 <details open>
 <summary><b>Multi-language Question Answering</b></summary>
 - TyDi-QA
 </details>
      </td>
      <td>
 <details open>
 <summary><b>Textual Entailment</b></summary>
 - CMNLI
 - OCNLI
 - OCNLI_FC
 - AX-b
 - AX-g
 - CB
 - RTE
 </details>
 <details open>
 <summary><b>Commonsense Reasoning</b></summary>
 - StoryCloze
 - StoryCloze-CN (coming soon)
 - COPA
 - ReCoRD
 - HellaSwag
 - PIQA
 - SIQA
 </details>
 <details open>
 <summary><b>Mathematical Reasoning</b></summary>
 - MATH
 - GSM8K
 </details>
 <details open>
 <summary><b>Theorem Application</b></summary>
 - TheoremQA
 </details>
 <details open>
 <summary><b>Code</b></summary>
 - HumanEval
 - MBPP
 </details>
 <details open>
 <summary><b>Comprehensive Reasoning</b></summary>
 - BBH
 </details>
      </td>
      <td>
 <details open>
 <summary><b>Junior High, High School, University, Professional Examinations</b></summary>
 - GAOKAO-2023
 - CEval
 - AGIEval
 - MMLU
 - GAOKAO-Bench
 - CMMLU
 - ARC
 </details>
      </td>
      <td>
 <details open>
 <summary><b>Reading Comprehension</b></summary>
 - C3
 - CMRC
 - DRCD
 - MultiRC
 - RACE
 </details>
 <details open>
 <summary><b>Content Summary</b></summary>
 - CSL
 - LCSTS
 - XSum
 </details>
 <details open>
 <summary><b>Content Analysis</b></summary>
 - EPRSTMT
 - LAMBADA
 - TNEWS
 </details>
      </td>
    </tr>
 </td>
    </tr>
  </tbody>
 </table>
 <p align="right"><a href="#top">🔝Back to top</a></p>
@ -274,112 +325,81 @@ We provide [OpenCompass Leaderbaord](https://opencompass.org.cn/rank) for commun
    <tr valign="top">
      <td>
- InternLM
+- [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
- LLaMA
+- [Baichuan](https://github.com/baichuan-inc)
- Vicuna
+- [BlueLM](https://github.com/vivo-ai-lab/BlueLM)
- Alpaca
+- [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B)
- Baichuan
+- [ChatGLM3](https://github.com/THUDM/ChatGLM3-6B)
- WizardLM
+- [Gemma](https://huggingface.co/google/gemma-7b)
- ChatGLM-6B
+- [InternLM](https://github.com/InternLM/InternLM)
- ChatGLM2-6B
+- [LLaMA](https://github.com/facebookresearch/llama)
- MPT
+- [LLaMA3](https://github.com/meta-llama/llama3)
- Falcon
+- [Qwen](https://github.com/QwenLM/Qwen)
- TigerBot
+- [TigerBot](https://github.com/TigerResearch/TigerBot)
- MOSS
+- [Vicuna](https://github.com/lm-sys/FastChat)
- ...
+- [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [Yi](https://github.com/01-ai/Yi)
 - ……
 </td>
 <td>
 - OpenAI
- Claude (coming soon)
+- Gemini
- PaLM (coming soon)
+- Claude
 - ZhipuAI(ChatGLM)
 - Baichuan
 - ByteDance(YunQue)
 - Huawei(PanGu)
 - 360
 - Baidu(ERNIEBot)
 - MiniMax(ABAB-Chat)
 - SenseTime(nova)
 - Xunfei(Spark)
 - ……
 </td>
 <!--
 - GLM
 - ...
 </td> -->
 </tr>
  </tbody>
 </table>
 ## 🛠️ Installation
 Below are the steps for quick installation and datasets preparation.
 ```Python
 conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
 conda activate opencompass
 git clone https://github.com/open-compass/opencompass opencompass
 cd opencompass
 pip install -e .
 # Download dataset to data/ folder
 wget https://github.com/open-compass/opencompass/releases/download/0.1.1/OpenCompassData.zip
 unzip OpenCompassData.zip
 ```
 Some third-party features, like Humaneval and Llama, may require additional steps to work properly. For detailed steps, please refer to the [Installation Guide](https://opencompass.readthedocs.io/en/latest/get_started.html).
 <p align="right"><a href="#top">🔝Back to top</a></p>
 ## 🏗️ ️Evaluation
 After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared, you can evaluate the performance of the LLaMA-7b model on the MMLU and C-Eval datasets using the following command:
 ```bash
 python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
 ```
 OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
 ```bash
 # List all configurations
 python tools/list_configs.py
 # List all configurations related to llama and mmlu
 python tools/list_configs.py llama mmlu
 ```
 You can also evaluate other HuggingFace models via command line. Taking LLaMA-7b as an example:
 ```bash
 python run.py --datasets ceval_ppl mmlu_ppl \
 --hf-path huggyllama/llama-7b \  # HuggingFace model path
 --model-kwargs device_map='auto' \  # Arguments for model construction
 --tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \  # Arguments for tokenizer construction
 --max-out-len 100 \  # Maximum number of tokens generated
 --max-seq-len 2048 \  # Maximum sequence length the model can accept
 --batch-size 8 \  # Batch size
 --no-batch-padding \  # Don't enable batch padding, infer through for loop to avoid performance loss
 --num-gpus 1  # Number of required GPUs
 ```
 Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started.html) to learn how to run an evaluation task.
 ## 🔜 Roadmap
- [ ] Subjective Evaluation
+- [x] Subjective Evaluation
-  - [ ] Release CompassAreana
+  - [x] Release CompassArena.
-  - [ ] Subjective evaluation dataset.
+  - [x] Subjective evaluation.
- [ ] Long-context
+- [x] Long-context
-  - [ ] Long-context evaluation with extensive datasets.
+  - [x] Long-context evaluation with extensive datasets.
  - [ ] Long-context leaderboard.
- [ ] Coding
+- [x] Coding
-  - [ ] Coding evaluation leaderdboard.
+  - [ ] Coding evaluation leaderboard.
-  - [ ] Non-python language evaluation service.
+  - [x] Non-python language evaluation service.
- [ ] Agent
+- [x] Agent
-  - [ ] Support various agenet framework.
+  - [ ] Support various agent frameworks.
-  - [ ] Evaluation of tool use of the LLMs.
+  - [x] Evaluation of tool use of the LLMs.
- [ ] Robustness
+- [x] Robustness
-  - [ ] Support various attack method
+  - [x] Support various attack methods.
 ## 👷‍♂️ Contributing
-We appreciate all contributions to improve OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/en/latest/notes/contribution_guide.html) for the best practice.
+We appreciate all contributions to improving OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/en/latest/notes/contribution_guide.html) for best practices.
 <!-- Copy-paste in your Readme.md file -->
 <!-- Made with [OSS Insight](https://ossinsight.io/) -->
 <a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
  <table>
    <tr>
      <th colspan="2">
        <br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
      </th>
    </tr>
  </table>
 </a>
 ## 🤝 Acknowledgements
@ -399,3 +419,20 @@ Some datasets and prompt implementations are modified from [chain-of-thought-hub
 ```
 <p align="right"><a href="#top">🔝Back to top</a></p>
 [github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
 [github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
 [github-forks-link]: https://github.com/open-compass/opencompass/network/members
 [github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
 [github-issues-link]: https://github.com/open-compass/opencompass/issues
 [github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
 [github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
 [github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
 [github-release-link]: https://github.com/open-compass/opencompass/releases
 [github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
 [github-releasedate-link]: https://github.com/open-compass/opencompass/releases
 [github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
 [github-stars-link]: https://github.com/open-compass/opencompass/stargazers
 [github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
 [github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
 [github-trending-url]: https://trendshift.io/repositories/6630
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@ -3,48 +3,275 @@
  <br />
  <br />
-[![docs](https://readthedocs.org/projects/opencompass/badge)](https://opencompass.readthedocs.io/zh_CN)
+[![][github-release-shield]][github-release-link]
-[![license](https://img.shields.io/github/license/InternLM/opencompass.svg)](https://github.com/open-compass/opencompass/blob/main/LICENSE)
+[![][github-releasedate-shield]][github-releasedate-link]
 [![][github-contributors-shield]][github-contributors-link]<br>
 [![][github-forks-shield]][github-forks-link]
 [![][github-stars-shield]][github-stars-link]
 [![][github-issues-shield]][github-issues-link]
 [![][github-license-shield]][github-license-link]
 <!-- [![PyPI](https://badge.fury.io/py/opencompass.svg)](https://pypi.org/project/opencompass/) -->
-[🌐Website](https://opencompass.org.cn/) |
+[🌐官方网站](https://opencompass.org.cn/) |
-[📘Documentation](https://opencompass.readthedocs.io/zh_CN/latest/index.html) |
+[📖数据集社区](https://hub.opencompass.org.cn/home) |
-[🛠️Installation](https://opencompass.readthedocs.io/zh_CN/latest/get_started.html#id1) |
+[📊性能榜单](https://rank.opencompass.org.cn/home) |
-[🤔Reporting Issues](https://github.com/open-compass/opencompass/issues/new/choose)
+[📘文档教程](https://opencompass.readthedocs.io/zh_CN/latest/index.html) |
 [🛠️安装](https://opencompass.readthedocs.io/zh_CN/latest/get_started/installation.html) |
 [🤔报告问题](https://github.com/open-compass/opencompass/issues/new/choose)
 [English](/README.md) | 简体中文
 [![][github-trending-shield]][github-trending-url]
 </div>
 <p align="center">
    👋 加入我们的 <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> 和 <a href="https://r.vansin.top/?r=opencompass" target="_blank">微信社区</a>
 </p>
 > \[!IMPORTANT\]
 >
 > **收藏项目**，你将能第一时间获取 OpenCompass 的最新动态～⭐️
 <details>
  <summary><kbd>Star History</kbd></summary>
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
    <img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
  </picture>
 </details>
 ## 🧭	欢迎
 来到**OpenCompass**！
 就像指南针在我们的旅程中为我们导航一样，我们希望OpenCompass能够帮助你穿越评估大型语言模型的重重迷雾。OpenCompass提供丰富的算法和功能支持，期待OpenCompass能够帮助社区更便捷地对NLP模型的性能进行公平全面的评估。
-> **🔥 注意**<br />
+🚩🚩🚩 欢迎加入 OpenCompass！我们目前**招聘全职研究人员/工程师和实习生**。如果您对 LLM 和 OpenCompass 充满热情，请随时通过[电子邮件](mailto:zhangsongyang@pjlab.org.cn)与我们联系。我们非常期待与您交流！
-> 我们正式启动 OpenCompass 共建计划，诚邀社区用户为 OpenCompass 提供更具代表性和可信度的客观评测数据集!
+
-> 点击 [Issue](https://github.com/open-compass/opencompass/issues/248) 获取更多数据集.
+🔥🔥🔥 祝贺 **OpenCompass 作为大模型标准测试工具被Meta AI官方推荐**, 点击 Llama 的 [入门文档](https://ai.meta.com/llama/get-started/#validation) 获取更多信息。
-> 让我们携手共进，打造功能强大易用的大模型评测平台！
+
 > **注意**<br />
 > 重要通知：从 v0.4.0 版本开始，所有位于 ./configs/datasets、./configs/models 和 ./configs/summarizers 目录下的 AMOTIC 配置文件将迁移至 opencompass 包中。请及时更新您的配置文件路径。
 ## 🚀 最新进展 <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2023.09.20\]** 我们在评测榜单上更新了[InternLM-20B](https://github.com/InternLM/InternLM), 欢迎访问[官方网站](https://opencompass.org.cn)获取详情.🔥🔥🔥.
+- **\[2025.04.01\]** OpenCompass 现已支持 `CascadeEvaluator`，允许多个评估器按顺序工作，可以为更复杂的评估场景创建自定义评估流程，查看[文档](docs/zh_cn/advanced_guides/llm_judge.md)了解具体用法！🔥🔥🔥
- **\[2023.09.19\]** 我们在评测榜单上更新了WeMix-LLaMA2-70B/Phi-1.5-1.3B, 欢迎访问[官方网站](https://opencompass.org.cn)获取详情.🔥🔥🔥.
+- **\[2025.03.11\]** 现已支持 `SuperGPQA`，覆盖 285 个研究生学科的知识能力评测，欢迎尝试！🔥🔥🔥
- **\[2023.09.18\]** 我们发布了[长文本评测指引](docs/zh_cn/advanced_guides/longeval.md).🔥🔥🔥.
+- **\[2025.02.28\]** 我们为 `DeepSeek-R1` 系列模型添加了教程，请查看 [评估推理模型](docs/zh_cn/user_guides/deepseek_r1.md) 了解更多详情！🔥🔥🔥
- **\[2023.09.08\]** 我们在评测榜单上更新了Baichuan-2/Tigerbot-2/Vicuna-v1.5, 欢迎访问[官方网站](https://opencompass.org.cn)获取详情。
+- **\[2025.02.15\]** 我们新增了两个实用的评测工具：用于LLM作为评判器的`GenericLLMEvaluator`和用于数学推理评估的`MATHVerifyEvaluator`。查看[LLM评判器](docs/zh_cn/advanced_guides/llm_judge.md)和[数学能力评测](docs/zh_cn/advanced_guides/general_math.md)文档了解更多详情！🔥🔥🔥
- **\[2023.09.06\]** 欢迎 [**Baichuan2**](https://github.com/baichuan-inc/Baichuan2) 团队采用OpenCompass对模型进行系统评估。我们非常感谢社区在提升LLM评估的透明度和可复现性上所做的努力。
+- **\[2025.01.16\]** 我们现已支持 [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) 模型，该模型在推理、知识类任务上取得同量级最优性能，欢迎尝试。
- **\[2023.09.02\]** 我们加入了[Qwen-VL](https://github.com/QwenLM/Qwen-VL)的评测支持。
+- **\[2024.12.17\]** 我们提供了12月CompassAcademic学术榜单评估脚本 [CompassAcademic](configs/eval_academic_leaderboard_202412.py)，你可以通过简单地配置复现官方评测结果。
- **\[2023.08.25\]** 欢迎 [**TigerBot**](https://github.com/TigerResearch/TigerBot) 团队采用OpenCompass对模型进行系统评估。我们非常感谢社区在提升LLM评估的透明度和可复现性上所做的努力。
+- **\[2024.10.14\]** 现已支持OpenAI多语言问答数据集[MMMLU](https://huggingface.co/datasets/openai/MMMLU)，欢迎尝试! 🔥🔥🔥
- **\[2023.08.21\]** [**Lagent**](https://github.com/InternLM/lagent) 正式发布，它是一个轻量级、开源的基于大语言模型的智能体（agent）框架。我们正与Lagent团队紧密合作，推进支持基于Lagent的大模型工具能力评测 !
+- **\[2024.09.19\]** 现已支持[Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) ，可以使用多种推理后端(huggingface/vllm/lmdeploy), 欢迎尝试! 🔥🔥🔥
 - **\[2024.09.05\]** 现已支持OpenAI o1 模型(`o1-mini-2024-09-12` and `o1-preview-2024-09-12`), 欢迎尝试! 🔥🔥🔥
 - **\[2024.09.05\]** OpenCompass 现在支持通过模型后处理来进行答案提取，以更准确地展示模型的能力。作为此次更新的一部分，我们集成了 [XFinder](https://github.com/IAAR-Shanghai/xFinder) 作为首个后处理模型。具体信息请参阅 [文档](opencompass/utils/postprocessors/xfinder/README.md)，欢迎尝试！ 🔥🔥🔥
 - **\[2024.08.20\]** OpenCompass 现已支持 [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists。 🔥🔥🔥
 - **\[2024.08.16\]** OpenCompass 现已支持全新的长上下文语言模型评估基准——[RULER](https://arxiv.org/pdf/2404.06654)。RULER 通过灵活的配置，提供了对长上下文包括检索、多跳追踪、聚合和问答等多种任务类型的评测，欢迎访问[RULER](configs/datasets/ruler/README.md)。🔥🔥🔥
 - **\[2024.07.23\]** 我们支持了[Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)模型，欢迎试用！🔥🔥🔥
 - **\[2024.07.23\]** 我们支持了[ModelScope](https://www.modelscope.cn)数据集，您可以按需加载，无需事先下载全部数据到本地，欢迎试用！🔥🔥🔥
 - **\[2024.07.17\]** 我们发布了CompassBench-202407榜单的示例数据和评测规则，敬请访问 [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) 获取更多信息。 🔥🔥🔥
 - **\[2024.07.17\]** 我们正式发布 NeedleBench 的[技术报告](http://arxiv.org/abs/2407.11963)。诚邀您访问我们的[帮助文档](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/needleinahaystack_eval.html)进行评估。🔥🔥🔥
 - **\[2024.07.04\]** OpenCompass 现已支持 InternLM2.5， 它拥有卓越的推理性能、有效支持百万字超长上下文以及工具调用能力整体升级，欢迎访问[OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) 和 [InternLM](https://github.com/InternLM/InternLM) .🔥🔥🔥.
 - **\[2024.06.20\]** OpenCompass 现已支持一键切换推理加速后端，助力评测过程更加高效。除了默认的HuggingFace推理后端外，还支持了常用的 [LMDeploy](https://github.com/InternLM/lmdeploy) 和 [vLLM](https://github.com/vllm-project/vllm) ，支持命令行一键切换和部署 API 加速服务两种方式，详细使用方法见[文档](docs/zh_cn/advanced_guides/accelerator_intro.md)。欢迎试用！🔥🔥🔥.
 > [更多](docs/zh_cn/notes/news.md)
 ## 📊 性能榜单
 我们将陆续提供开源模型和 API 模型的具体性能榜单，请见 [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) 。如需加入评测，请提供模型仓库地址或标准的 API 接口至邮箱  `opencompass@pjlab.org.cn`.
 你也可以参考[CompassAcademic](configs/eval_academic_leaderboard_202412.py)，快速地复现榜单的结果，目前选取的数据集包括 综合知识推理 (MMLU-Pro/GPQA Diamond) ,逻辑推理 (BBH) ,数学推理 (MATH-500, AIME) ,代码生成 (LiveCodeBench, HumanEval) ,指令跟随 (IFEval) 。
 <p align="right"><a href="#top">🔝返回顶部</a></p>
 ## 🛠️ 安装指南
 下面提供了快速安装和数据集准备的步骤。
 ### 💻 环境搭建
 我们强烈建议使用 `conda` 来管理您的 Python 环境。
 - #### 创建虚拟环境
  ```bash
  conda create --name opencompass python=3.10 -y
  conda activate opencompass
  ```
 - #### 通过pip安装OpenCompass
  ```bash
  # 支持绝大多数数据集及模型
  pip install -U opencompass
  # 完整安装（支持更多数据集）
  # pip install "opencompass[full]"
  # 模型推理后端，由于这些推理后端通常存在依赖冲突，建议使用不同的虚拟环境来管理它们。
  # pip install "opencompass[lmdeploy]"
  # pip install "opencompass[vllm]"
  # API 测试（例如 OpenAI、Qwen）
  # pip install "opencompass[api]"
  ```
 - #### 基于源码安装OpenCompass
  如果希望使用 OpenCompass 的最新功能，也可以从源代码构建它：
  ```bash
  git clone https://github.com/open-compass/opencompass opencompass
  cd opencompass
  pip install -e .
  # pip install -e ".[full]"
  # pip install -e ".[vllm]"
  ```
 ### 📂 数据准备
 #### 提前离线下载
 OpenCompass支持使用本地数据集进行评测，数据集的下载和解压可以通过以下命令完成：
 ```bash
 # 下载数据集到 data/ 处
 wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
 unzip OpenCompassData-core-20240207.zip
 ```
 #### 从 OpenCompass 自动下载
 我们已经支持从OpenCompass存储服务器自动下载数据集。您可以通过额外的 `--dry-run` 参数来运行评估以下载这些数据集。
 目前支持的数据集列表在[这里](https://github.com/open-compass/opencompass/blob/main/opencompass/utils/datasets_info.py#L259)。更多数据集将会很快上传。
 #### (可选) 使用 ModelScope 自动下载
 另外，您还可以使用[ModelScope](https://www.modelscope.cn)来加载数据集：
 环境准备：
 ```bash
 pip install modelscope
 export DATASET_SOURCE=ModelScope
 ```
 配置好环境后，无需下载全部数据，直接提交评测任务即可。目前支持的数据集有：
 ```bash
 humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli
 ```
 有部分第三方功能,如 Humaneval 以及 Llama,可能需要额外步骤才能正常运行，详细步骤请参考[安装指南](https://opencompass.readthedocs.io/zh_CN/latest/get_started/installation.html)。
 <p align="right"><a href="#top">🔝返回顶部</a></p>
 ## 🏗️ ️评测
 在确保按照上述步骤正确安装了 OpenCompass 并准备好了数据集之后，现在您可以开始使用 OpenCompass 进行首次评估！
 - ### 首次评测
  OpenCompass 支持通过命令行界面 (CLI) 或 Python 脚本来设置配置。对于简单的评估设置，我们推荐使用 CLI；而对于更复杂的评估，则建议使用脚本方式。你可以在examples文件夹下找到更多脚本示例。
  ```bash
  # 命令行界面 (CLI)
  opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
  # Python 脚本
  opencompass examples/eval_chat_demo.py
  ```
  你可以在[examples](./examples) 文件夹下找到更多的脚本示例。
 - ### API评测
  OpenCompass 在设计上并不区分开源模型与 API 模型。您可以以相同的方式或甚至在同一设置中评估这两种类型的模型。
  ```bash
  export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
  # 命令行界面 (CLI)
  opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
  # Python 脚本
  opencompass  examples/eval_api_demo.py
  # 现已支持 o1_mini_2024_09_12/o1_preview_2024_09_12  模型, 默认情况下 max_completion_tokens=8192.
  ```
 - ### 推理后端
  另外，如果您想使用除 HuggingFace 之外的推理后端来进行加速评估，比如 LMDeploy 或 vLLM，可以通过以下命令进行。请确保您已经为所选的后端安装了必要的软件包，并且您的模型支持该后端的加速推理。更多信息，请参阅关于推理加速后端的文档 [这里](docs/zh_cn/advanced_guides/accelerator_intro.md)。以下是使用 LMDeploy 的示例：
  ```bash
  opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
  ```
 - ### 支持的模型与数据集
  OpenCompass 预定义了许多模型和数据集的配置，你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。
  ```bash
  # 列出所有配置
  python tools/list_configs.py
  # 列出所有跟 llama 及 mmlu 相关的配置
  python tools/list_configs.py llama mmlu
  ```
  #### 支持的模型
  如果模型不在列表中，但支持 Huggingface AutoModel 类或支持针对 OpenAI 接口的推理引擎封装（详见[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/new_model.html)），您仍然可以使用 OpenCompass 对其进行评估。欢迎您贡献维护 OpenCompass 支持的模型和数据集列表。
  ```bash
  opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
  ```
  #### 支持的数据集
  目前，OpenCompass针对数据集给出了标准的推荐配置。通常，`_gen.py`或`_llm_judge_gen.py`为结尾的配置文件将指向我们为该数据集提供的推荐配置。您可以参阅[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节来获取详细信息。
  ```bash
  # 基于规则的推荐配置
  opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
  # 基于LLM Judge的推荐配置
  opencompass --datasets aime2024_llmjudge_gen --models hf_internlm2_5_1_8b_chat
  ```
  此外，如果你想在多块 GPU 上使用模型进行推理，您可以使用 `--max-num-worker` 参数。
  ```bash
  CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
  ```
 > \[!TIP\]
 >
 > `--hf-num-gpus` 用于 模型并行(huggingface 格式)，`--max-num-worker` 用于数据并行。
 > \[!TIP\]
 >
 > configuration with `_ppl` is designed for base model typically.
 > 配置带 `_ppl` 的配置设计给基础模型使用。
 > 配置带 `_gen` 的配置可以同时用于基础模型和对话模型。
 通过命令行或配置文件，OpenCompass 还支持评测 API 或自定义模型，以及更多样化的评测策略。请阅读[快速开始](https://opencompass.readthedocs.io/zh_CN/latest/get_started/quick_start.html)了解如何运行一个评测任务。
 更多教程请查看我们的[文档](https://opencompass.readthedocs.io/zh_CN/latest/index.html)。
 <p align="right"><a href="#top">🔝返回顶部</a></p>
 ## 📣 OpenCompass 2.0
 我们很高兴发布 OpenCompass 司南 2.0 大模型评测体系，它主要由三大核心模块构建而成：[CompassKit](https://github.com/open-compass)、[CompassHub](https://hub.opencompass.org.cn/home)以及[CompassRank](https://rank.opencompass.org.cn/home)。
 **CompassRank** 系统进行了重大革新与提升，现已成为一个兼容并蓄的排行榜体系，不仅囊括了开源基准测试项目，还包含了私有基准测试。此番升级极大地拓宽了对行业内各类模型进行全面而深入测评的可能性。
 **CompassHub** 创新性地推出了一个基准测试资源导航平台，其设计初衷旨在简化和加快研究人员及行业从业者在多样化的基准测试库中进行搜索与利用的过程。为了让更多独具特色的基准测试成果得以在业内广泛传播和应用，我们热忱欢迎各位将自定义的基准数据贡献至CompassHub平台。只需轻点鼠标，通过访问[这里](https://hub.opencompass.org.cn/dataset-submit)，即可启动提交流程。
 **CompassKit** 是一系列专为大型语言模型和大型视觉-语言模型打造的强大评估工具合集，它所提供的全面评测工具集能够有效地对这些复杂模型的功能性能进行精准测量和科学评估。在此，我们诚挚邀请您在学术研究或产品研发过程中积极尝试运用我们的工具包，以助您取得更加丰硕的研究成果和产品优化效果。
 ## ✨ 介绍
 ![image](https://github.com/open-compass/opencompass/assets/22607038/30bcb2e2-3969-4ac5-9f29-ad3f4abb4f3b)
@ -53,7 +280,7 @@ OpenCompass 是面向大模型评测的一站式平台。其主要特点如下
 - **开源可复现**：提供公平、公开、可复现的大模型评测方案
- **全面的能力维度**：五大维度设计，提供 50+ 个数据集约 30 万题的的模型评测方案，全面评估模型能力
+- **全面的能力维度**：五大维度设计，提供 70+ 个数据集约 40 万题的模型评测方案，全面评估模型能力
 - **丰富的模型支持**：已支持 20+ HuggingFace 及 API 模型
@ -63,198 +290,13 @@ OpenCompass 是面向大模型评测的一站式平台。其主要特点如下
 - **灵活化拓展**：想增加新模型或数据集？想要自定义更高级的任务分割策略，甚至接入新的集群管理系统？OpenCompass 的一切均可轻松扩展！
 ## 📊 性能榜单
 我们将陆续提供开源模型和API模型的具体性能榜单，请见 [OpenCompass Leaderboard](https://opencompass.org.cn/rank) 。如需加入评测，请提供模型仓库地址或标准的 API 接口至邮箱  `opencompass@pjlab.org.cn`.
 <p align="right"><a href="#top">🔝返回顶部</a></p>
 ## 📖 数据集支持
-<table align="center">
+我们已经在OpenCompass官网的文档中支持了所有可在本平台上使用的数据集的统计列表。
  <tbody>
    <tr align="center" valign="bottom">
      <td>
        <b>语言</b>
      </td>
      <td>
        <b>知识</b>
      </td>
      <td>
        <b>推理</b>
      </td>
      <td>
        <b>学科</b>
      </td>
      <td>
        <b>理解</b>
      </td>
    </tr>
    <tr valign="top">
      <td>
 <details open>
 <summary><b>字词释义</b></summary>
- WiC
+您可以通过排序、筛选和搜索等功能从列表中快速找到您需要的数据集。
 - SummEdits
-</details>
+详情请参阅 [官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节。
 <details open>
 <summary><b>成语习语</b></summary>
 - CHID
 </details>
 <details open>
 <summary><b>语义相似度</b></summary>
 - AFQMC
 - BUSTM
 </details>
 <details open>
 <summary><b>指代消解</b></summary>
 - CLUEWSC
 - WSC
 - WinoGrande
 </details>
 <details open>
 <summary><b>翻译</b></summary>
 - Flores
 </details>
      </td>
      <td>
 <details open>
 <summary><b>知识问答</b></summary>
 - BoolQ
 - CommonSenseQA
 - NaturalQuestion
 - TrivialQA
 </details>
 <details open>
 <summary><b>多语种问答</b></summary>
 - TyDi-QA
 </details>
      </td>
      <td>
 <details open>
 <summary><b>文本蕴含</b></summary>
 - CMNLI
 - OCNLI
 - OCNLI_FC
 - AX-b
 - AX-g
 - CB
 - RTE
 </details>
 <details open>
 <summary><b>常识推理</b></summary>
 - StoryCloze
 - StoryCloze-CN（即将上线）
 - COPA
 - ReCoRD
 - HellaSwag
 - PIQA
 - SIQA
 </details>
 <details open>
 <summary><b>数学推理</b></summary>
 - MATH
 - GSM8K
 </details>
 <details open>
 <summary><b>定理应用</b></summary>
 - TheoremQA
 </details>
 <details open>
 <summary><b>代码</b></summary>
 - HumanEval
 - MBPP
 </details>
 <details open>
 <summary><b>综合推理</b></summary>
 - BBH
 </details>
      </td>
      <td>
 <details open>
 <summary><b>初中/高中/大学/职业考试</b></summary>
 - GAOKAO-2023
 - CEval
 - AGIEval
 - MMLU
 - GAOKAO-Bench
 - CMMLU
 - ARC
 </details>
      </td>
      <td>
 <details open>
 <summary><b>阅读理解</b></summary>
 - C3
 - CMRC
 - DRCD
 - MultiRC
 - RACE
 </details>
 <details open>
 <summary><b>内容总结</b></summary>
 - CSL
 - LCSTS
 - XSum
 </details>
 <details open>
 <summary><b>内容分析</b></summary>
 - EPRSTMT
 - LAMBADA
 - TNEWS
 </details>
      </td>
    </tr>
 </td>
    </tr>
  </tbody>
 </table>
 <p align="right"><a href="#top">🔝返回顶部</a></p>
@ -276,113 +318,78 @@ OpenCompass 是面向大模型评测的一站式平台。其主要特点如下
    <tr valign="top">
      <td>
- LLaMA
+- [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
- Vicuna
+- [Baichuan](https://github.com/baichuan-inc)
- Alpaca
+- [BlueLM](https://github.com/vivo-ai-lab/BlueLM)
- Baichuan
+- [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B)
- WizardLM
+- [ChatGLM3](https://github.com/THUDM/ChatGLM3-6B)
- ChatGLM-6B
+- [Gemma](https://huggingface.co/google/gemma-7b)
- ChatGLM2-6B
+- [InternLM](https://github.com/InternLM/InternLM)
- MPT
+- [LLaMA](https://github.com/facebookresearch/llama)
- Falcon
+- [LLaMA3](https://github.com/meta-llama/llama3)
- TigerBot
+- [Qwen](https://github.com/QwenLM/Qwen)
- MOSS
+- [TigerBot](https://github.com/TigerResearch/TigerBot)
 - [Vicuna](https://github.com/lm-sys/FastChat)
 - [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [Yi](https://github.com/01-ai/Yi)
 - ……
 </td>
 <td>
 - OpenAI
- Claude (即将推出)
+- Gemini
- PaLM (即将推出)
+- Claude
 - ZhipuAI(ChatGLM)
 - Baichuan
 - ByteDance(YunQue)
 - Huawei(PanGu)
 - 360
 - Baidu(ERNIEBot)
 - MiniMax(ABAB-Chat)
 - SenseTime(nova)
 - Xunfei(Spark)
 - ……
 </td>
 <!-- <td>
 - GLM
 - ……
 </td> -->
 </tr>
  </tbody>
 </table>
 ## 🛠️ 安装
 下面展示了快速安装以及准备数据集的步骤。
 ```Python
 conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
 conda activate opencompass
 git clone https://github.com/open-compass/opencompass opencompass
 cd opencompass
 pip install -e .
 # 下载数据集到 data/ 处
 wget https://github.com/open-compass/opencompass/releases/download/0.1.1/OpenCompassData.zip
 unzip OpenCompassData.zip
 ```
 有部分第三方功能,如 Humaneval 以及 Llama,可能需要额外步骤才能正常运行，详细步骤请参考[安装指南](https://opencompass.readthedocs.io/zh_CN/latest/get_started.html)。
 <p align="right"><a href="#top">🔝返回顶部</a></p>
 ## 🏗️ ️评测
 确保按照上述步骤正确安装 OpenCompass 并准备好数据集后，可以通过以下命令评测 LLaMA-7b 模型在 MMLU 和 C-Eval 数据集上的性能：
 ```bash
 python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
 ```
 OpenCompass 预定义了许多模型和数据集的配置，你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。
 ```bash
 # 列出所有配置
 python tools/list_configs.py
 # 列出所有跟 llama 及 mmlu 相关的配置
 python tools/list_configs.py llama mmlu
 ```
 你也可以通过命令行去评测其它 HuggingFace 模型。同样以 LLaMA-7b 为例：
 ```bash
 python run.py --datasets ceval_ppl mmlu_ppl \
 --hf-path huggyllama/llama-7b \  # HuggingFace 模型地址
 --model-kwargs device_map='auto' \  # 构造 model 的参数
 --tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \  # 构造 tokenizer 的参数
 --max-out-len 100 \  # 最长生成 token 数
 --max-seq-len 2048 \  # 模型能接受的最大序列长度
 --batch-size 8 \  # 批次大小
 --no-batch-padding \  # 不打开 batch padding，通过 for loop 推理，避免精度损失
 --num-gpus 1  # 所需 gpu 数
 ```
 通过命令行或配置文件，OpenCompass 还支持评测 API 或自定义模型，以及更多样化的评测策略。请阅读[快速上手](https://opencompass.readthedocs.io/zh_CN/latest/get_started.html#id3)了解如何运行一个评测任务。
 更多教程请查看我们的[文档](https://opencompass.readthedocs.io/zh_CN/latest/index.html)。
 ## 🔜 路线图
- [ ] 主观评测
+- [x] 主观评测
-  - [ ] 发布主观评测榜单
+  - [x] 发布主观评测榜单
-  - [ ] 发布主观评测数据集
+  - [x] 发布主观评测数据集
- [ ] 长文本
+- [x] 长文本
-  - [ ] 支持广泛的长文本评测集
+  - [x] 支持广泛的长文本评测集
  - [ ] 发布长文本评测榜单
- [ ] 代码能力
+- [x] 代码能力
  - [ ] 发布代码能力评测榜单
-  - [ ] 提供非Python语言的评测服务
+  - [x] 提供非Python语言的评测服务
- [ ] 智能体
+- [x] 智能体
  - [ ] 支持丰富的智能体方案
-  - [ ] 提供智能体评测榜单
+  - [x] 提供智能体评测榜单
- [ ] 鲁棒性
+- [x] 鲁棒性
-  - [ ] 支持各类攻击方法
+  - [x] 支持各类攻击方法
 ## 👷‍♂️ 贡献
 我们感谢所有的贡献者为改进和提升 OpenCompass 所作出的努力。请参考[贡献指南](https://opencompass.readthedocs.io/zh_CN/latest/notes/contribution_guide.html)来了解参与项目贡献的相关指引。
 <a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
  <table>
    <tr>
      <th colspan="2">
        <br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
      </th>
    </tr>
  </table>
 </a>
 ## 🤝 致谢
 该项目部分的代码引用并修改自 [OpenICL](https://github.com/Shark-NLP/OpenICL)。
@ -401,3 +408,20 @@ python run.py --datasets ceval_ppl mmlu_ppl \
 ```
 <p align="right"><a href="#top">🔝返回顶部</a></p>
 [github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
 [github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
 [github-forks-link]: https://github.com/open-compass/opencompass/network/members
 [github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
 [github-issues-link]: https://github.com/open-compass/opencompass/issues
 [github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
 [github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
 [github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
 [github-release-link]: https://github.com/open-compass/opencompass/releases
 [github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
 [github-releasedate-link]: https://github.com/open-compass/opencompass/releases
 [github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
 [github-stars-link]: https://github.com/open-compass/opencompass/stargazers
 [github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
 [github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
 [github-trending-url]: https://trendshift.io/repositories/6630
--- a/configs/datasets/TheoremQA/TheoremQA_gen_424e0a.py
+++ b/configs/datasets/TheoremQA/TheoremQA_gen_424e0a.py
@ -1,40 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess
 # Reader config: 'Question' and 'Answer_type' are the input columns and
 # 'Answer' is the output column; train_split is set to 'test'.
 TheoremQA_reader_cfg = dict(
    input_columns=['Question', 'Answer_type'],
    output_column='Answer',
    train_split='test')
 # Instruction text describing the expected answer format per Answer type.
 # NOTE(review): the last two literals are concatenated without a separating
 # space, so the resulting text reads "...(c), (d).You need to output...".
 TheoremQA_prompt1 = "Please read a math problem, and then think step by step to derive the answer. The answer is decided by Answer Type. " \
        "If the Answer type in [bool], the answer needs to be True or False. " \
        "Else if the Answer type in [integer, float] , The answer needs to be in numerical form. " \
        "Else if the Answer type in [list of integer, list of float] , the answer needs to be a list of number like [2, 3, 4]. " \
        "Else if the Answer type in [option], the answer needs to be an option like (a), (b), (c), (d)." \
        "You need to output the answer in your final sentence like 'Therefore, the answer is ...'."
 # Full prompt: embeds TheoremQA_prompt1 under "### Instruction:" and lays out
 # "### Input:" / "### Response:" sections. The doubled braces survive f-string
 # evaluation as literal {Question} and {Answer_type} placeholders.
 TheoremQA_prompt2 = f"Below is an instruction that describes a task, paired with an input that provides further context. " \
        f"Write a response that appropriately completes the request.\n\n### Instruction:\n{TheoremQA_prompt1}\n\n### Input:\n{{Question}}\nAnswer_type:{{Answer_type}}\n### Response:\n"
 # Inference config: PromptTemplate built from TheoremQA_prompt2, a
 # ZeroRetriever (presumably no in-context examples -- confirm against the
 # retriever implementation), and a GenInferencer with max_out_len=512.
 TheoremQA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=TheoremQA_prompt2),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))
 # Evaluation config: AccEvaluator, with TheoremQA_postprocess registered as
 # the prediction postprocessor.
 TheoremQA_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=TheoremQA_postprocess))
 # Exported dataset list: a single 'TheoremQA' entry that reads
 # ./data/TheoremQA/test.csv and ties together the three configs above.
 TheoremQA_datasets = [
    dict(
        abbr='TheoremQA',
        type=TheoremQADataset,
        path="./data/TheoremQA/test.csv",
        reader_cfg=TheoremQA_reader_cfg,
        infer_cfg=TheoremQA_infer_cfg,
        eval_cfg=TheoremQA_eval_cfg)
 ]
--- a/configs/datasets/TheoremQA/TheoremQA_gen_ef26ca.py
+++ b/configs/datasets/TheoremQA/TheoremQA_gen_ef26ca.py
@ -1,37 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess
 # Reader config: 'Question' and 'Answer_type' are the input columns and
 # 'Answer' is the output column; train_split is set to 'test'.
 TheoremQA_reader_cfg = dict(
    input_columns=['Question', 'Answer_type'],
    output_column='Answer',
    train_split='test')
 # Inference config: a single HUMAN-role round whose prompt lists the four
 # accepted answer forms and embeds the {Question} placeholder.
 # NOTE(review): 'Answer_type' is declared as an input column above but is not
 # referenced by this prompt.
 TheoremQA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=
                """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:\n1. a numerical value like 0.1, no symbol and no unit at all.\n2. a list of number like [2, 3, 4].\n3. True/False.\n4. an option like (a), (b), (c), (d)\nQuestion: {Question}\nLet\'s think step by step."""
            ),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))
 # Evaluation config: AccEvaluator, with TheoremQA_postprocess registered as
 # the prediction postprocessor.
 TheoremQA_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=TheoremQA_postprocess))
 # Exported dataset list: a single 'TheoremQA' entry that reads
 # ./data/TheoremQA/test.csv and ties together the three configs above.
 TheoremQA_datasets = [
    dict(
        abbr='TheoremQA',
        type=TheoremQADataset,
        path="./data/TheoremQA/test.csv",
        reader_cfg=TheoremQA_reader_cfg,
        infer_cfg=TheoremQA_infer_cfg,
        eval_cfg=TheoremQA_eval_cfg)
 ]
--- a/configs/datasets/apps/apps_gen.py
+++ b/configs/datasets/apps/apps_gen.py
@ -1,4 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    from .apps_gen_7fbb95 import apps_datasets  # noqa: F401, F403
--- a/configs/datasets/bbh/bbh_gen.py
+++ b/configs/datasets/bbh/bbh_gen.py
@ -1,4 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    from .bbh_gen_5b92b0 import bbh_datasets  # noqa: F401, F403
--- a/configs/datasets/ceval/ceval_gen_2daf24.py
+++ b/configs/datasets/ceval/ceval_gen_2daf24.py
@ -1,188 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import CEvalDataset
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 # C-Eval subject table: subject key -> [English title, Chinese title, category].
 # Insertion order is significant: it determines the order of the derived
 # subject list and therefore of the generated dataset configs.
 ceval_subject_mapping = {
     # --- STEM ---
     "computer_network": ["Computer Network", "\u8ba1\u7b97\u673a\u7f51\u7edc", "STEM"],
     "operating_system": ["Operating System", "\u64cd\u4f5c\u7cfb\u7edf", "STEM"],
     "computer_architecture": ["Computer Architecture", "\u8ba1\u7b97\u673a\u7ec4\u6210", "STEM"],
     "college_programming": ["College Programming", "\u5927\u5b66\u7f16\u7a0b", "STEM"],
     "college_physics": ["College Physics", "\u5927\u5b66\u7269\u7406", "STEM"],
     "college_chemistry": ["College Chemistry", "\u5927\u5b66\u5316\u5b66", "STEM"],
     "advanced_mathematics": ["Advanced Mathematics", "\u9ad8\u7b49\u6570\u5b66", "STEM"],
     "probability_and_statistics": ["Probability and Statistics", "\u6982\u7387\u7edf\u8ba1", "STEM"],
     "discrete_mathematics": ["Discrete Mathematics", "\u79bb\u6563\u6570\u5b66", "STEM"],
     "electrical_engineer": ["Electrical Engineer", "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", "STEM"],
     "metrology_engineer": ["Metrology Engineer", "\u6ce8\u518c\u8ba1\u91cf\u5e08", "STEM"],
     "high_school_mathematics": ["High School Mathematics", "\u9ad8\u4e2d\u6570\u5b66", "STEM"],
     "high_school_physics": ["High School Physics", "\u9ad8\u4e2d\u7269\u7406", "STEM"],
     "high_school_chemistry": ["High School Chemistry", "\u9ad8\u4e2d\u5316\u5b66", "STEM"],
     "high_school_biology": ["High School Biology", "\u9ad8\u4e2d\u751f\u7269", "STEM"],
     "middle_school_mathematics": ["Middle School Mathematics", "\u521d\u4e2d\u6570\u5b66", "STEM"],
     "middle_school_biology": ["Middle School Biology", "\u521d\u4e2d\u751f\u7269", "STEM"],
     "middle_school_physics": ["Middle School Physics", "\u521d\u4e2d\u7269\u7406", "STEM"],
     "middle_school_chemistry": ["Middle School Chemistry", "\u521d\u4e2d\u5316\u5b66", "STEM"],
     "veterinary_medicine": ["Veterinary Medicine", "\u517d\u533b\u5b66", "STEM"],
     # --- Social Science ---
     "college_economics": ["College Economics", "\u5927\u5b66\u7ecf\u6d4e\u5b66", "Social Science"],
     "business_administration": ["Business Administration", "\u5de5\u5546\u7ba1\u7406", "Social Science"],
     "marxism": ["Marxism", "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", "Social Science"],
     "mao_zedong_thought": ["Mao Zedong Thought", "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", "Social Science"],
     "education_science": ["Education Science", "\u6559\u80b2\u5b66", "Social Science"],
     "teacher_qualification": ["Teacher Qualification", "\u6559\u5e08\u8d44\u683c", "Social Science"],
     "high_school_politics": ["High School Politics", "\u9ad8\u4e2d\u653f\u6cbb", "Social Science"],
     "high_school_geography": ["High School Geography", "\u9ad8\u4e2d\u5730\u7406", "Social Science"],
     "middle_school_politics": ["Middle School Politics", "\u521d\u4e2d\u653f\u6cbb", "Social Science"],
     "middle_school_geography": ["Middle School Geography", "\u521d\u4e2d\u5730\u7406", "Social Science"],
     # --- Humanities ---
     "modern_chinese_history": ["Modern Chinese History", "\u8fd1\u4ee3\u53f2\u7eb2\u8981", "Humanities"],
     "ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", "Humanities"],
     "logic": ["Logic", "\u903b\u8f91\u5b66", "Humanities"],
     "law": ["Law", "\u6cd5\u5b66", "Humanities"],
     "chinese_language_and_literature": ["Chinese Language and Literature", "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", "Humanities"],
     "art_studies": ["Art Studies", "\u827a\u672f\u5b66", "Humanities"],
     "professional_tour_guide": ["Professional Tour Guide", "\u5bfc\u6e38\u8d44\u683c", "Humanities"],
     "legal_professional": ["Legal Professional", "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", "Humanities"],
     "high_school_chinese": ["High School Chinese", "\u9ad8\u4e2d\u8bed\u6587", "Humanities"],
     "high_school_history": ["High School History", "\u9ad8\u4e2d\u5386\u53f2", "Humanities"],
     "middle_school_history": ["Middle School History", "\u521d\u4e2d\u5386\u53f2", "Humanities"],
     # --- Other ---
     "civil_servant": ["Civil Servant", "\u516c\u52a1\u5458", "Other"],
     "sports_science": ["Sports Science", "\u4f53\u80b2\u5b66", "Other"],
     "plant_protection": ["Plant Protection", "\u690d\u7269\u4fdd\u62a4", "Other"],
     "basic_medicine": ["Basic Medicine", "\u57fa\u7840\u533b\u5b66", "Other"],
     "clinical_medicine": ["Clinical Medicine", "\u4e34\u5e8a\u533b\u5b66", "Other"],
     "urban_and_rural_planner": ["Urban and Rural Planner", "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", "Other"],
     "accountant": ["Accountant", "\u6ce8\u518c\u4f1a\u8ba1\u5e08", "Other"],
     "fire_engineer": ["Fire Engineer", "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", "Other"],
     "environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", "Other"],
     "tax_accountant": ["Tax Accountant", "\u7a0e\u52a1\u5e08", "Other"],
     "physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"],
 }
 # Build one generation-style dataset config per (split, subject) pair.
 ceval_all_sets = list(ceval_subject_mapping)
 ceval_datasets = []
 for _split in ["val", "test"]:
     for _name in ceval_all_sets:
         # Second element of each mapping entry: the Chinese subject title,
         # interpolated into the prompt below.
         _ch_name = ceval_subject_mapping[_name][1]
         # Prompt (in Chinese) introduces single-choice questions for the
         # subject, lists options A-D and ends with the answer cue; the BOT
         # turn carries the reference answer for in-context examples.
         # fix_id_list presumably pins the few-shot examples to ids 0-4 of
         # the train ("dev") split -- confirm against FixKRetriever.
         ceval_infer_cfg = dict(
             ice_template=dict(
                 type=PromptTemplate,
                 template=dict(
                     begin="</E>",
                     round=[
                         dict(
                             role="HUMAN",
                             prompt=f"以下是中国关于{_ch_name}考试的单项选择题，请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案: ",
                         ),
                         dict(role="BOT", prompt="{answer}"),
                     ],
                 ),
                 ice_token="</E>",
             ),
             retriever=dict(type=FixKRetriever),
             inferencer=dict(type=GenInferencer, fix_id_list=[0, 1, 2, 3, 4]),
         )
         # Predictions go through first_capital_postprocess before accuracy
         # scoring.
         ceval_eval_cfg = dict(
             evaluator=dict(type=AccEvaluator),
             pred_postprocessor=dict(type=first_capital_postprocess),
         )
         ceval_datasets.append(
             dict(
                 type=CEvalDataset,
                 path="./data/ceval/formal_ceval",
                 name=_name,
                 # "ceval-<subject>" for val, "ceval-test-<subject>" otherwise.
                 abbr=("ceval-" if _split == "val" else "ceval-test-") + _name,
                 reader_cfg=dict(
                     input_columns=["question", "A", "B", "C", "D"],
                     output_column="answer",
                     train_split="dev",
                     test_split=_split,
                 ),
                 infer_cfg=ceval_infer_cfg,
                 eval_cfg=ceval_eval_cfg,
             )
         )
 # Drop the loop temporaries from the module namespace.
 del _split, _name, _ch_name
--- a/configs/datasets/ceval/ceval_gen_5f30c7.py
+++ b/configs/datasets/ceval/ceval_gen_5f30c7.py
@ -1,188 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import CEvalDataset
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 # C-Eval subject table: subject key -> [English title, Chinese title, category].
 # Insertion order is significant: it determines the order of the derived
 # subject list and therefore of the generated dataset configs.
 ceval_subject_mapping = {
     # --- STEM ---
     "computer_network": ["Computer Network", "\u8ba1\u7b97\u673a\u7f51\u7edc", "STEM"],
     "operating_system": ["Operating System", "\u64cd\u4f5c\u7cfb\u7edf", "STEM"],
     "computer_architecture": ["Computer Architecture", "\u8ba1\u7b97\u673a\u7ec4\u6210", "STEM"],
     "college_programming": ["College Programming", "\u5927\u5b66\u7f16\u7a0b", "STEM"],
     "college_physics": ["College Physics", "\u5927\u5b66\u7269\u7406", "STEM"],
     "college_chemistry": ["College Chemistry", "\u5927\u5b66\u5316\u5b66", "STEM"],
     "advanced_mathematics": ["Advanced Mathematics", "\u9ad8\u7b49\u6570\u5b66", "STEM"],
     "probability_and_statistics": ["Probability and Statistics", "\u6982\u7387\u7edf\u8ba1", "STEM"],
     "discrete_mathematics": ["Discrete Mathematics", "\u79bb\u6563\u6570\u5b66", "STEM"],
     "electrical_engineer": ["Electrical Engineer", "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", "STEM"],
     "metrology_engineer": ["Metrology Engineer", "\u6ce8\u518c\u8ba1\u91cf\u5e08", "STEM"],
     "high_school_mathematics": ["High School Mathematics", "\u9ad8\u4e2d\u6570\u5b66", "STEM"],
     "high_school_physics": ["High School Physics", "\u9ad8\u4e2d\u7269\u7406", "STEM"],
     "high_school_chemistry": ["High School Chemistry", "\u9ad8\u4e2d\u5316\u5b66", "STEM"],
     "high_school_biology": ["High School Biology", "\u9ad8\u4e2d\u751f\u7269", "STEM"],
     "middle_school_mathematics": ["Middle School Mathematics", "\u521d\u4e2d\u6570\u5b66", "STEM"],
     "middle_school_biology": ["Middle School Biology", "\u521d\u4e2d\u751f\u7269", "STEM"],
     "middle_school_physics": ["Middle School Physics", "\u521d\u4e2d\u7269\u7406", "STEM"],
     "middle_school_chemistry": ["Middle School Chemistry", "\u521d\u4e2d\u5316\u5b66", "STEM"],
     "veterinary_medicine": ["Veterinary Medicine", "\u517d\u533b\u5b66", "STEM"],
     # --- Social Science ---
     "college_economics": ["College Economics", "\u5927\u5b66\u7ecf\u6d4e\u5b66", "Social Science"],
     "business_administration": ["Business Administration", "\u5de5\u5546\u7ba1\u7406", "Social Science"],
     "marxism": ["Marxism", "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", "Social Science"],
     "mao_zedong_thought": ["Mao Zedong Thought", "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", "Social Science"],
     "education_science": ["Education Science", "\u6559\u80b2\u5b66", "Social Science"],
     "teacher_qualification": ["Teacher Qualification", "\u6559\u5e08\u8d44\u683c", "Social Science"],
     "high_school_politics": ["High School Politics", "\u9ad8\u4e2d\u653f\u6cbb", "Social Science"],
     "high_school_geography": ["High School Geography", "\u9ad8\u4e2d\u5730\u7406", "Social Science"],
     "middle_school_politics": ["Middle School Politics", "\u521d\u4e2d\u653f\u6cbb", "Social Science"],
     "middle_school_geography": ["Middle School Geography", "\u521d\u4e2d\u5730\u7406", "Social Science"],
     # --- Humanities ---
     "modern_chinese_history": ["Modern Chinese History", "\u8fd1\u4ee3\u53f2\u7eb2\u8981", "Humanities"],
     "ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", "Humanities"],
     "logic": ["Logic", "\u903b\u8f91\u5b66", "Humanities"],
     "law": ["Law", "\u6cd5\u5b66", "Humanities"],
     "chinese_language_and_literature": ["Chinese Language and Literature", "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", "Humanities"],
     "art_studies": ["Art Studies", "\u827a\u672f\u5b66", "Humanities"],
     "professional_tour_guide": ["Professional Tour Guide", "\u5bfc\u6e38\u8d44\u683c", "Humanities"],
     "legal_professional": ["Legal Professional", "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", "Humanities"],
     "high_school_chinese": ["High School Chinese", "\u9ad8\u4e2d\u8bed\u6587", "Humanities"],
     "high_school_history": ["High School History", "\u9ad8\u4e2d\u5386\u53f2", "Humanities"],
     "middle_school_history": ["Middle School History", "\u521d\u4e2d\u5386\u53f2", "Humanities"],
     # --- Other ---
     "civil_servant": ["Civil Servant", "\u516c\u52a1\u5458", "Other"],
     "sports_science": ["Sports Science", "\u4f53\u80b2\u5b66", "Other"],
     "plant_protection": ["Plant Protection", "\u690d\u7269\u4fdd\u62a4", "Other"],
     "basic_medicine": ["Basic Medicine", "\u57fa\u7840\u533b\u5b66", "Other"],
     "clinical_medicine": ["Clinical Medicine", "\u4e34\u5e8a\u533b\u5b66", "Other"],
     "urban_and_rural_planner": ["Urban and Rural Planner", "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", "Other"],
     "accountant": ["Accountant", "\u6ce8\u518c\u4f1a\u8ba1\u5e08", "Other"],
     "fire_engineer": ["Fire Engineer", "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", "Other"],
     "environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", "Other"],
     "tax_accountant": ["Tax Accountant", "\u7a0e\u52a1\u5e08", "Other"],
     "physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"],
 }
 # Build one generation-style dataset config per subject (val split only).
 ceval_all_sets = list(ceval_subject_mapping)
 ceval_datasets = []
 for _split in ["val"]:
     for _name in ceval_all_sets:
         # Second element of each mapping entry: the Chinese subject title,
         # interpolated into the prompt below.
         _ch_name = ceval_subject_mapping[_name][1]
         # Prompt (in Chinese) introduces single-choice questions for the
         # subject, lists options A-D and ends with the answer cue; the BOT
         # turn carries the reference answer for in-context examples.
         # fix_id_list presumably pins the few-shot examples to ids 0-4 of
         # the train ("dev") split -- confirm against FixKRetriever.
         ceval_infer_cfg = dict(
             ice_template=dict(
                 type=PromptTemplate,
                 template=dict(
                     begin="</E>",
                     round=[
                         dict(
                             role="HUMAN",
                             prompt=f"以下是中国关于{_ch_name}考试的单项选择题，请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案: ",
                         ),
                         dict(role="BOT", prompt="{answer}"),
                     ],
                 ),
                 ice_token="</E>",
             ),
             retriever=dict(type=FixKRetriever),
             inferencer=dict(type=GenInferencer, fix_id_list=[0, 1, 2, 3, 4]),
         )
         # Predictions go through first_capital_postprocess before accuracy
         # scoring.
         ceval_eval_cfg = dict(
             evaluator=dict(type=AccEvaluator),
             pred_postprocessor=dict(type=first_capital_postprocess),
         )
         ceval_datasets.append(
             dict(
                 type=CEvalDataset,
                 path="./data/ceval/formal_ceval",
                 name=_name,
                 # "ceval-<subject>" for val, "ceval-test-<subject>" otherwise.
                 abbr=("ceval-" if _split == "val" else "ceval-test-") + _name,
                 reader_cfg=dict(
                     input_columns=["question", "A", "B", "C", "D"],
                     output_column="answer",
                     train_split="dev",
                     test_split=_split,
                 ),
                 infer_cfg=ceval_infer_cfg,
                 eval_cfg=ceval_eval_cfg,
             )
         )
 # Drop the loop temporaries from the module namespace.
 del _split, _name, _ch_name
--- a/configs/datasets/ceval/ceval_ppl_578f8d.py
+++ b/configs/datasets/ceval/ceval_ppl_578f8d.py
@ -1,188 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import CEvalDataset
 # C-Eval subject table: subject key -> [English title, Chinese title, category].
 # Insertion order is significant: it determines the order of the derived
 # subject list and therefore of the generated dataset configs.
 ceval_subject_mapping = {
     # --- STEM ---
     "computer_network": ["Computer Network", "\u8ba1\u7b97\u673a\u7f51\u7edc", "STEM"],
     "operating_system": ["Operating System", "\u64cd\u4f5c\u7cfb\u7edf", "STEM"],
     "computer_architecture": ["Computer Architecture", "\u8ba1\u7b97\u673a\u7ec4\u6210", "STEM"],
     "college_programming": ["College Programming", "\u5927\u5b66\u7f16\u7a0b", "STEM"],
     "college_physics": ["College Physics", "\u5927\u5b66\u7269\u7406", "STEM"],
     "college_chemistry": ["College Chemistry", "\u5927\u5b66\u5316\u5b66", "STEM"],
     "advanced_mathematics": ["Advanced Mathematics", "\u9ad8\u7b49\u6570\u5b66", "STEM"],
     "probability_and_statistics": ["Probability and Statistics", "\u6982\u7387\u7edf\u8ba1", "STEM"],
     "discrete_mathematics": ["Discrete Mathematics", "\u79bb\u6563\u6570\u5b66", "STEM"],
     "electrical_engineer": ["Electrical Engineer", "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", "STEM"],
     "metrology_engineer": ["Metrology Engineer", "\u6ce8\u518c\u8ba1\u91cf\u5e08", "STEM"],
     "high_school_mathematics": ["High School Mathematics", "\u9ad8\u4e2d\u6570\u5b66", "STEM"],
     "high_school_physics": ["High School Physics", "\u9ad8\u4e2d\u7269\u7406", "STEM"],
     "high_school_chemistry": ["High School Chemistry", "\u9ad8\u4e2d\u5316\u5b66", "STEM"],
     "high_school_biology": ["High School Biology", "\u9ad8\u4e2d\u751f\u7269", "STEM"],
     "middle_school_mathematics": ["Middle School Mathematics", "\u521d\u4e2d\u6570\u5b66", "STEM"],
     "middle_school_biology": ["Middle School Biology", "\u521d\u4e2d\u751f\u7269", "STEM"],
     "middle_school_physics": ["Middle School Physics", "\u521d\u4e2d\u7269\u7406", "STEM"],
     "middle_school_chemistry": ["Middle School Chemistry", "\u521d\u4e2d\u5316\u5b66", "STEM"],
     "veterinary_medicine": ["Veterinary Medicine", "\u517d\u533b\u5b66", "STEM"],
     # --- Social Science ---
     "college_economics": ["College Economics", "\u5927\u5b66\u7ecf\u6d4e\u5b66", "Social Science"],
     "business_administration": ["Business Administration", "\u5de5\u5546\u7ba1\u7406", "Social Science"],
     "marxism": ["Marxism", "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", "Social Science"],
     "mao_zedong_thought": ["Mao Zedong Thought", "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", "Social Science"],
     "education_science": ["Education Science", "\u6559\u80b2\u5b66", "Social Science"],
     "teacher_qualification": ["Teacher Qualification", "\u6559\u5e08\u8d44\u683c", "Social Science"],
     "high_school_politics": ["High School Politics", "\u9ad8\u4e2d\u653f\u6cbb", "Social Science"],
     "high_school_geography": ["High School Geography", "\u9ad8\u4e2d\u5730\u7406", "Social Science"],
     "middle_school_politics": ["Middle School Politics", "\u521d\u4e2d\u653f\u6cbb", "Social Science"],
     "middle_school_geography": ["Middle School Geography", "\u521d\u4e2d\u5730\u7406", "Social Science"],
     # --- Humanities ---
     "modern_chinese_history": ["Modern Chinese History", "\u8fd1\u4ee3\u53f2\u7eb2\u8981", "Humanities"],
     "ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", "Humanities"],
     "logic": ["Logic", "\u903b\u8f91\u5b66", "Humanities"],
     "law": ["Law", "\u6cd5\u5b66", "Humanities"],
     "chinese_language_and_literature": ["Chinese Language and Literature", "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", "Humanities"],
     "art_studies": ["Art Studies", "\u827a\u672f\u5b66", "Humanities"],
     "professional_tour_guide": ["Professional Tour Guide", "\u5bfc\u6e38\u8d44\u683c", "Humanities"],
     "legal_professional": ["Legal Professional", "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", "Humanities"],
     "high_school_chinese": ["High School Chinese", "\u9ad8\u4e2d\u8bed\u6587", "Humanities"],
     "high_school_history": ["High School History", "\u9ad8\u4e2d\u5386\u53f2", "Humanities"],
     "middle_school_history": ["Middle School History", "\u521d\u4e2d\u5386\u53f2", "Humanities"],
     # --- Other ---
     "civil_servant": ["Civil Servant", "\u516c\u52a1\u5458", "Other"],
     "sports_science": ["Sports Science", "\u4f53\u80b2\u5b66", "Other"],
     "plant_protection": ["Plant Protection", "\u690d\u7269\u4fdd\u62a4", "Other"],
     "basic_medicine": ["Basic Medicine", "\u57fa\u7840\u533b\u5b66", "Other"],
     "clinical_medicine": ["Clinical Medicine", "\u4e34\u5e8a\u533b\u5b66", "Other"],
     "urban_and_rural_planner": ["Urban and Rural Planner", "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", "Other"],
     "accountant": ["Accountant", "\u6ce8\u518c\u4f1a\u8ba1\u5e08", "Other"],
     "fire_engineer": ["Fire Engineer", "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", "Other"],
     "environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", "Other"],
     "tax_accountant": ["Tax Accountant", "\u7a0e\u52a1\u5e08", "Other"],
     "physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"],
 }
 # Build one perplexity-style dataset config per subject (val split only).
 ceval_all_sets = list(ceval_subject_mapping)
 ceval_datasets = []
 for _split in ["val"]:
     for _name in ceval_all_sets:
         # Second element of each mapping entry: the Chinese subject title,
         # interpolated into the prompt below.
         _ch_name = ceval_subject_mapping[_name][1]
         # One template per candidate label: the HUMAN turn is the same
         # Chinese single-choice prompt, and the BOT turn is the label itself
         # ("A".."D"), giving PPLInferencer one completion per option.
         # fix_id_list presumably pins the few-shot examples to ids 0-4 of
         # the train ("dev") split -- confirm against FixKRetriever.
         ceval_infer_cfg = dict(
             ice_template=dict(
                 type=PromptTemplate,
                 template={
                     answer: dict(
                         begin="</E>",
                         round=[
                             dict(
                                 role="HUMAN",
                                 prompt=f"以下是中国关于{_ch_name}考试的单项选择题，请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案: ",
                             ),
                             dict(role="BOT", prompt=answer),
                         ],
                     )
                     for answer in ["A", "B", "C", "D"]
                 },
                 ice_token="</E>",
             ),
             retriever=dict(type=FixKRetriever),
             inferencer=dict(type=PPLInferencer, fix_id_list=[0, 1, 2, 3, 4]),
         )
         # No post-processing: accuracy is computed directly.
         ceval_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
         ceval_datasets.append(
             dict(
                 type=CEvalDataset,
                 path="./data/ceval/formal_ceval",
                 name=_name,
                 # "ceval-<subject>" for val, "ceval-test-<subject>" otherwise.
                 abbr=("ceval-" if _split == "val" else "ceval-test-") + _name,
                 reader_cfg=dict(
                     input_columns=["question", "A", "B", "C", "D"],
                     output_column="answer",
                     train_split="dev",
                     test_split=_split,
                 ),
                 infer_cfg=ceval_infer_cfg,
                 eval_cfg=ceval_eval_cfg,
             )
         )
 # Drop the loop temporaries from the module namespace.
 del _split, _name, _ch_name
--- a/configs/datasets/ceval/ceval_ppl_93e5ce.py
+++ b/configs/datasets/ceval/ceval_ppl_93e5ce.py
@ -1,188 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import CEvalDataset
 # C-Eval subject table: subject key -> [English title, Chinese title, category].
 # Insertion order is significant: it determines the order of the derived
 # subject list and therefore of the generated dataset configs.
 ceval_subject_mapping = {
     # --- STEM ---
     "computer_network": ["Computer Network", "\u8ba1\u7b97\u673a\u7f51\u7edc", "STEM"],
     "operating_system": ["Operating System", "\u64cd\u4f5c\u7cfb\u7edf", "STEM"],
     "computer_architecture": ["Computer Architecture", "\u8ba1\u7b97\u673a\u7ec4\u6210", "STEM"],
     "college_programming": ["College Programming", "\u5927\u5b66\u7f16\u7a0b", "STEM"],
     "college_physics": ["College Physics", "\u5927\u5b66\u7269\u7406", "STEM"],
     "college_chemistry": ["College Chemistry", "\u5927\u5b66\u5316\u5b66", "STEM"],
     "advanced_mathematics": ["Advanced Mathematics", "\u9ad8\u7b49\u6570\u5b66", "STEM"],
     "probability_and_statistics": ["Probability and Statistics", "\u6982\u7387\u7edf\u8ba1", "STEM"],
     "discrete_mathematics": ["Discrete Mathematics", "\u79bb\u6563\u6570\u5b66", "STEM"],
     "electrical_engineer": ["Electrical Engineer", "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", "STEM"],
     "metrology_engineer": ["Metrology Engineer", "\u6ce8\u518c\u8ba1\u91cf\u5e08", "STEM"],
     "high_school_mathematics": ["High School Mathematics", "\u9ad8\u4e2d\u6570\u5b66", "STEM"],
     "high_school_physics": ["High School Physics", "\u9ad8\u4e2d\u7269\u7406", "STEM"],
     "high_school_chemistry": ["High School Chemistry", "\u9ad8\u4e2d\u5316\u5b66", "STEM"],
     "high_school_biology": ["High School Biology", "\u9ad8\u4e2d\u751f\u7269", "STEM"],
     "middle_school_mathematics": ["Middle School Mathematics", "\u521d\u4e2d\u6570\u5b66", "STEM"],
     "middle_school_biology": ["Middle School Biology", "\u521d\u4e2d\u751f\u7269", "STEM"],
     "middle_school_physics": ["Middle School Physics", "\u521d\u4e2d\u7269\u7406", "STEM"],
     "middle_school_chemistry": ["Middle School Chemistry", "\u521d\u4e2d\u5316\u5b66", "STEM"],
     "veterinary_medicine": ["Veterinary Medicine", "\u517d\u533b\u5b66", "STEM"],
     # --- Social Science ---
     "college_economics": ["College Economics", "\u5927\u5b66\u7ecf\u6d4e\u5b66", "Social Science"],
     "business_administration": ["Business Administration", "\u5de5\u5546\u7ba1\u7406", "Social Science"],
     "marxism": ["Marxism", "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", "Social Science"],
     "mao_zedong_thought": ["Mao Zedong Thought", "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", "Social Science"],
     "education_science": ["Education Science", "\u6559\u80b2\u5b66", "Social Science"],
     "teacher_qualification": ["Teacher Qualification", "\u6559\u5e08\u8d44\u683c", "Social Science"],
     "high_school_politics": ["High School Politics", "\u9ad8\u4e2d\u653f\u6cbb", "Social Science"],
     "high_school_geography": ["High School Geography", "\u9ad8\u4e2d\u5730\u7406", "Social Science"],
     "middle_school_politics": ["Middle School Politics", "\u521d\u4e2d\u653f\u6cbb", "Social Science"],
     "middle_school_geography": ["Middle School Geography", "\u521d\u4e2d\u5730\u7406", "Social Science"],
     # --- Humanities ---
     "modern_chinese_history": ["Modern Chinese History", "\u8fd1\u4ee3\u53f2\u7eb2\u8981", "Humanities"],
     "ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", "Humanities"],
     "logic": ["Logic", "\u903b\u8f91\u5b66", "Humanities"],
     "law": ["Law", "\u6cd5\u5b66", "Humanities"],
     "chinese_language_and_literature": ["Chinese Language and Literature", "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", "Humanities"],
     "art_studies": ["Art Studies", "\u827a\u672f\u5b66", "Humanities"],
     "professional_tour_guide": ["Professional Tour Guide", "\u5bfc\u6e38\u8d44\u683c", "Humanities"],
     "legal_professional": ["Legal Professional", "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", "Humanities"],
     "high_school_chinese": ["High School Chinese", "\u9ad8\u4e2d\u8bed\u6587", "Humanities"],
     "high_school_history": ["High School History", "\u9ad8\u4e2d\u5386\u53f2", "Humanities"],
     "middle_school_history": ["Middle School History", "\u521d\u4e2d\u5386\u53f2", "Humanities"],
     # --- Other ---
     "civil_servant": ["Civil Servant", "\u516c\u52a1\u5458", "Other"],
     "sports_science": ["Sports Science", "\u4f53\u80b2\u5b66", "Other"],
     "plant_protection": ["Plant Protection", "\u690d\u7269\u4fdd\u62a4", "Other"],
     "basic_medicine": ["Basic Medicine", "\u57fa\u7840\u533b\u5b66", "Other"],
     "clinical_medicine": ["Clinical Medicine", "\u4e34\u5e8a\u533b\u5b66", "Other"],
     "urban_and_rural_planner": ["Urban and Rural Planner", "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", "Other"],
     "accountant": ["Accountant", "\u6ce8\u518c\u4f1a\u8ba1\u5e08", "Other"],
     "fire_engineer": ["Fire Engineer", "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", "Other"],
     "environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", "Other"],
     "tax_accountant": ["Tax Accountant", "\u7a0e\u52a1\u5e08", "Other"],
     "physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"],
 }
 # Build one perplexity-style dataset config per (split, subject) pair.
 ceval_all_sets = list(ceval_subject_mapping)
 ceval_datasets = []
 for _split in ["val", "test"]:
     for _name in ceval_all_sets:
         # Second element of each mapping entry: the Chinese subject title,
         # interpolated into the prompt below.
         _ch_name = ceval_subject_mapping[_name][1]
         # One template per candidate label: the HUMAN turn is the same
         # Chinese single-choice prompt, and the BOT turn is the label itself
         # ("A".."D"), giving PPLInferencer one completion per option.
         # fix_id_list presumably pins the few-shot examples to ids 0-4 of
         # the train ("dev") split -- confirm against FixKRetriever.
         ceval_infer_cfg = dict(
             ice_template=dict(
                 type=PromptTemplate,
                 template={
                     answer: dict(
                         begin="</E>",
                         round=[
                             dict(
                                 role="HUMAN",
                                 prompt=f"以下是中国关于{_ch_name}考试的单项选择题，请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案: ",
                             ),
                             dict(role="BOT", prompt=answer),
                         ],
                     )
                     for answer in ["A", "B", "C", "D"]
                 },
                 ice_token="</E>",
             ),
             retriever=dict(type=FixKRetriever),
             inferencer=dict(type=PPLInferencer, fix_id_list=[0, 1, 2, 3, 4]),
         )
         # No post-processing: accuracy is computed directly.
         ceval_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
         ceval_datasets.append(
             dict(
                 type=CEvalDataset,
                 path="./data/ceval/formal_ceval",
                 name=_name,
                 # "ceval-<subject>" for val, "ceval-test-<subject>" otherwise.
                 abbr=("ceval-" if _split == "val" else "ceval-test-") + _name,
                 reader_cfg=dict(
                     input_columns=["question", "A", "B", "C", "D"],
                     output_column="answer",
                     train_split="dev",
                     test_split=_split,
                 ),
                 infer_cfg=ceval_infer_cfg,
                 eval_cfg=ceval_eval_cfg,
             )
         )
 # Drop the loop temporaries from the module namespace.
 del _split, _name, _ch_name
--- a/configs/datasets/cmb/cmb_gen.py
+++ b/configs/datasets/cmb/cmb_gen.py
@ -1,4 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    from .cmb_gen_72cbb7 import cmb_datasets  # noqa: F401, F403
--- a/configs/datasets/cmb/cmb_gen_72cbb7.py
+++ b/configs/datasets/cmb/cmb_gen_72cbb7.py
@ -1,43 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import CMBDataset
 cmb_datasets = []
 # Reader config: the five input columns are the fields referenced by the
 # HUMAN prompt below; no output column is read (output_column=None).
 cmb_reader_cfg = dict(
    input_columns=["exam_type", "exam_class", "question_type", "question", "option_str"],
    output_column=None,
    train_split="val",
    test_split="test"
 )
 # Inference config: generation with in-context examples injected at the
 # "</E>" token of the template.
 cmb_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin="</E>",
            round=[
                dict(
                    role="HUMAN",
                    # Plain string on purpose: nothing is interpolated when the
                    # config is loaded, so the {placeholders} are written with
                    # single braces instead of an f-string with escaped braces.
                    prompt="以下是中国{exam_type}中{exam_class}考试的一道{question_type}，不需要做任何分析和解释，直接输出答案选项。\n{question}\n{option_str} \n 答案: ",
                ),
                dict(role="BOT", prompt="{answer}"),
            ],
        ),
        ice_token="</E>",
    ),
    retriever=dict(type=FixKRetriever),
    inferencer=dict(type=GenInferencer, fix_id_list=[0, 1, 2, 3, 4]),
 )
 # A single dataset entry; no eval_cfg is attached in this config.
 cmb_datasets.append(
    dict(
        type=CMBDataset,
        path="./data/CMB/",
        abbr="cmb",
        reader_cfg=cmb_reader_cfg,
        infer_cfg=cmb_infer_cfg
    )
 )
--- a/configs/datasets/cmmlu/cmmlu_gen.py
+++ b/configs/datasets/cmmlu/cmmlu_gen.py
@ -1,4 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    from .cmmlu_gen_c13365 import cmmlu_datasets  # noqa: F401, F403
--- a/configs/datasets/drop/drop_gen.py
+++ b/configs/datasets/drop/drop_gen.py
@ -1,4 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    from .drop_gen_599f07 import drop_datasets  # noqa: F401, F403
--- a/configs/datasets/flores/flores_gen_806ede.py
+++ b/configs/datasets/flores/flores_gen_806ede.py
@ -1,161 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import TopkRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import BleuEvaluator
 from opencompass.datasets import FloresFirst100Dataset
 # Language table for the FLORES subtasks. Each row holds four strings:
 #   [0] short language key (becomes the key of flores_lang_map),
 #   [1] FLORES column code, used to build the "sentence_<code>" column names,
 #   [2] English language name, interpolated into the translation prompt,
 #   [3] a grouping label (looks like the language family; it is unpacked but
 #       not used by the code in this file).
 _flores_lang_map = [
    ["eng", "eng_Latn", "English", "Indo-European-Germanic"],
    ["afr", "afr_Latn", "Afrikaans", "Indo-European-Germanic"],
    ["dan", "dan_Latn", "Danish", "Indo-European-Germanic"],
    ["deu", "deu_Latn", "German", "Indo-European-Germanic"],
    ["isl", "isl_Latn", "Icelandic", "Indo-European-Germanic"],
    ["ltz", "ltz_Latn", "Luxembourgish", "Indo-European-Germanic"],
    ["nld", "nld_Latn", "Dutch", "Indo-European-Germanic"],
    ["nob", "nob_Latn", "Norwegian", "Indo-European-Germanic"],
    ["swe", "swe_Latn", "Swedish", "Indo-European-Germanic"],
    ["ast", "ast_Latn", "Asturian", "Indo-European-Romance"],
    ["cat", "cat_Latn", "Catalan", "Indo-European-Romance"],
    ["fra", "fra_Latn", "French", "Indo-European-Romance"],
    ["glg", "glg_Latn", "Galician", "Indo-European-Romance"],
    ["oci", "oci_Latn", "Occitan", "Indo-European-Romance"],
    ["por", "por_Latn", "Portuguese", "Indo-European-Romance"],
    ["ron", "ron_Latn", "Romanian", "Indo-European-Romance"],
    ["spa", "spa_Latn", "Spanish", "Indo-European-Romance"],
    ["bel", "bel_Cyrl", "Belarusian", "Indo-European-Slavic"],
    ["bos", "bos_Latn", "Bosnian", "Indo-European-Slavic"],
    ["bul", "bul_Cyrl", "Bulgarian", "Indo-European-Slavic"],
    ["ces", "ces_Latn", "Czech", "Indo-European-Slavic"],
    ["hrv", "hrv_Latn", "Croatian", "Indo-European-Slavic"],
    ["mkd", "mkd_Cyrl", "Macedonian", "Indo-European-Slavic"],
    ["pol", "pol_Latn", "Polish", "Indo-European-Slavic"],
    ["rus", "rus_Cyrl", "Russian", "Indo-European-Slavic"],
    ["slk", "slk_Latn", "Slovak", "Indo-European-Slavic"],
    ["slv", "slv_Latn", "Slovenian", "Indo-European-Slavic"],
    ["srp", "srp_Cyrl", "Serbian", "Indo-European-Slavic"],
    ["ukr", "ukr_Cyrl", "Ukrainian", "Indo-European-Slavic"],
    ["asm", "asm_Beng", "Assamese", "Indo-European-Indo-Aryan"],
    ["ben", "ben_Beng", "Bengali", "Indo-European-Indo-Aryan"],
    ["guj", "guj_Gujr", "Gujarati", "Indo-European-Indo-Aryan"],
    ["hin", "hin_Deva", "Hindi", "Indo-European-Indo-Aryan"],
    ["mar", "mar_Deva", "Marathi", "Indo-European-Indo-Aryan"],
    ["npi", "npi_Deva", "Nepali", "Indo-European-Indo-Aryan"],
    ["ory", "ory_Orya", "Oriya", "Indo-European-Indo-Aryan"],
    ["pan", "pan_Guru", "Punjabi", "Indo-European-Indo-Aryan"],
    ["snd", "snd_Arab", "Sindhi", "Indo-European-Indo-Aryan"],
    ["urd", "urd_Arab", "Urdu", "Indo-European-Indo-Aryan"],
    ["ckb", "ckb_Arab", "Kurdish", "Indo-European-Other"],
    ["cym", "cym_Latn", "Welsh", "Indo-European-Other"],
    ["ell", "ell_Grek", "Greek", "Indo-European-Other"],
    ["fas", "pes_Arab", "Persian", "Indo-European-Other"],
    ["gle", "gle_Latn", "Irish", "Indo-European-Other"],
    ["hye", "hye_Armn", "Armenian", "Indo-European-Other"],
    ["ita", "ita_Latn", "Italian", "Indo-European-Other"],
    ["lav", "lvs_Latn", "Latvian", "Indo-European-Other"],
    ["lit", "lit_Latn", "Lithuanian", "Indo-European-Other"],
    ["pus", "pbt_Arab", "Pashto", "Indo-European-Other"],
    ["tgk", "tgk_Cyrl", "Tajik", "Indo-European-Other"],
    ["ceb", "ceb_Latn", "Cebuano", "Austronesian"],
    ["ind", "ind_Latn", "Indonesian", "Austronesian"],
    ["jav", "jav_Latn", "Javanese", "Austronesian"],
    ["mri", "mri_Latn", "Maori", "Austronesian"],
    ["msa", "zsm_Latn", "Malay", "Austronesian"],
    ["tgl", "tgl_Latn", "Tagalog", "Austronesian"],
    ["ibo", "ibo_Latn", "Igbo", "Atlantic-Congo"],
    ["kam", "kam_Latn", "Kamba", "Atlantic-Congo"],
    ["kea", "kea_Latn", "Kabuverdianu", "Atlantic-Congo"],
    ["lin", "lin_Latn", "Lingala", "Atlantic-Congo"],
    ["lug", "lug_Latn", "Luganda", "Atlantic-Congo"],
    ["nso", "nso_Latn", "Northern Sotho", "Atlantic-Congo"],
    ["nya", "nya_Latn", "Nyanja", "Atlantic-Congo"],
    ["sna", "sna_Latn", "Shona", "Atlantic-Congo"],
    ["swh", "swh_Latn", "Swahili", "Atlantic-Congo"],
    ["umb", "umb_Latn", "Umbundu", "Atlantic-Congo"],
    ["wol", "wol_Latn", "Wolof", "Atlantic-Congo"],
    ["xho", "xho_Latn", "Xhosa", "Atlantic-Congo"],
    ["yor", "yor_Latn", "Yoruba", "Atlantic-Congo"],
    ["zul", "zul_Latn", "Zulu", "Atlantic-Congo"],
    ["amh", "amh_Ethi", "Amharic", "Afro-Asiatic"],
    ["ara", "arb_Arab", "Arabic", "Afro-Asiatic"],
    ["ful", "fuv_Latn", "Fulah", "Afro-Asiatic"],
    ["mlt", "mlt_Latn", "Maltese", "Afro-Asiatic"],
    ["orm", "gaz_Latn", "Oromo", "Afro-Asiatic"],
    ["som", "som_Latn", "Somali", "Afro-Asiatic"],
    ["azj", "azj_Latn", "Azerbaijani", "Turkic"],
    ["kaz", "kaz_Cyrl", "Kazakh", "Turkic"],
    ["kir", "kir_Cyrl", "Kyrgyz", "Turkic"],
    ["tur", "tur_Latn", "Turkish", "Turkic"],
    ["uzb", "uzn_Latn", "Uzbek", "Turkic"],
    ["kan", "kan_Knda", "Kannada", "Dravidian"],
    ["mal", "mal_Mlym", "Malayalam", "Dravidian"],
    ["tam", "tam_Taml", "Tamil", "Dravidian"],
    ["tel", "tel_Telu", "Telugu", "Dravidian"],
    ["mya", "mya_Mymr", "Burmese", "Sino-Tibetan"],
    ["zho_simpl", "zho_Hans", "Chinese (Simpl)", "Sino-Tibetan"],
    ["zho_trad", "zho_Hant", "Chinese (Trad)", "Sino-Tibetan"],
    ["est", "est_Latn", "Estonian", "Other"],
    ["fin", "fin_Latn", "Finnish", "Other"],
    ["hau", "hau_Latn", "Hausa", "Other"],
    ["heb", "heb_Hebr", "Hebrew", "Other"],
    ["hun", "hun_Latn", "Hungarian", "Other"],
    ["jpn", "jpn_Jpan", "Japanese", "Other"],
    ["kat", "kat_Geor", "Georgian", "Other"],
    ["khm", "khm_Khmr", "Khmer", "Other"],
    ["kor", "kor_Hang", "Korean", "Other"],
    ["lao", "lao_Laoo", "Lao", "Other"],
    ["luo", "luo_Latn", "Luo", "Other"],
    ["mon", "khk_Cyrl", "Mongolian", "Other"],
    ["tha", "tha_Thai", "Thai", "Other"],
    ["vie", "vie_Latn", "Vietnamese", "Other"],
 ]
 # Index the language rows by their first field.
 flores_lang_map = {_row[0]: _row for _row in _flores_lang_map}
 # Every non-English language is evaluated in both directions:
 # first all "eng-<lang>" subtasks, then all "<lang>-eng" subtasks.
 _flores_subtasks = [f"eng-{_code}" for _code in flores_lang_map if _code != "eng"]
 _flores_subtasks += [f"{_code}-eng" for _code in flores_lang_map if _code != "eng"]
 flores_datasets = []
 for _flores_subtask in _flores_subtasks:
    _src, _tgt = _flores_subtask.split("-")
    # Fields 1 and 2 of a row: the FLORES column code and the language name.
    _flores_source, _src_inst = flores_lang_map[_src][1:3]
    _flores_target, _tgt_inst = flores_lang_map[_tgt][1:3]
    _src_column = f"sentence_{_flores_source}"
    _tgt_column = f"sentence_{_flores_target}"
    flores_infer_cfg = {
        "ice_template": {
            "type": PromptTemplate,
            "template": {
                "begin": "</E>",
                "round": [
                    {
                        "role": "HUMAN",
                        # The triple braces wrap the column name in literal
                        # braces, leaving a "{sentence_<code>}" placeholder.
                        "prompt": f"Translate the following {_src_inst} statements to {_tgt_inst}.\n{{{_src_column}}}",
                    },
                    {"role": "BOT", "prompt": f"{{{_tgt_column}}}"},
                ],
            },
            "ice_token": "</E>",
        },
        "retriever": {"type": TopkRetriever, "ice_num": 8},
        "inferencer": {"type": GenInferencer},
    }
    flores_eval_cfg = {
        "evaluator": {"type": BleuEvaluator},
        "pred_role": "BOT",
    }
    # Only a Simplified-Chinese target gets the "flores" postprocessors.
    if _tgt == "zho_simpl":
        flores_eval_cfg.update(
            pred_postprocessor={"type": "flores"},
            dataset_postprocessor={"type": "flores"},
        )
    flores_datasets.append({
        "type": FloresFirst100Dataset,
        "abbr": f"flores_100_{_src}-{_tgt}",
        "name": f"{_flores_source}-{_flores_target}",
        "reader_cfg": {
            "input_columns": _src_column,
            "output_column": _tgt_column,
            "train_split": "dev",
            "test_split": "devtest",
        },
        # Shallow copies, so the entry does not alias the module-level names.
        "infer_cfg": dict(flores_infer_cfg),
        "eval_cfg": dict(flores_eval_cfg),
    })
 # Remove the table and the loop temporaries from the module namespace.
 del _flores_lang_map, _flores_subtask, _src, _tgt, _flores_source, _src_inst, _flores_target, _tgt_inst, _src_column, _tgt_column
--- a/configs/datasets/flores/flores_gen_aad4fd.py
+++ b/configs/datasets/flores/flores_gen_aad4fd.py
@ -1,154 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import TopkRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import BleuEvaluator
 from opencompass.datasets import FloresFirst100Dataset
 # Language table for the FLORES subtasks. Each row holds four strings:
 #   [0] short language key (becomes the key of flores_lang_map),
 #   [1] FLORES column code, used to build the "sentence_<code>" column names,
 #   [2] English language name (unpacked by the loop in this file, but not
 #       used in its templates),
 #   [3] a grouping label (looks like the language family; it is unpacked but
 #       not used by the code in this file).
 _flores_lang_map = [
    ["eng", "eng_Latn", "English", "Indo-European-Germanic"],
    ["afr", "afr_Latn", "Afrikaans", "Indo-European-Germanic"],
    ["dan", "dan_Latn", "Danish", "Indo-European-Germanic"],
    ["deu", "deu_Latn", "German", "Indo-European-Germanic"],
    ["isl", "isl_Latn", "Icelandic", "Indo-European-Germanic"],
    ["ltz", "ltz_Latn", "Luxembourgish", "Indo-European-Germanic"],
    ["nld", "nld_Latn", "Dutch", "Indo-European-Germanic"],
    ["nob", "nob_Latn", "Norwegian", "Indo-European-Germanic"],
    ["swe", "swe_Latn", "Swedish", "Indo-European-Germanic"],
    ["ast", "ast_Latn", "Asturian", "Indo-European-Romance"],
    ["cat", "cat_Latn", "Catalan", "Indo-European-Romance"],
    ["fra", "fra_Latn", "French", "Indo-European-Romance"],
    ["glg", "glg_Latn", "Galician", "Indo-European-Romance"],
    ["oci", "oci_Latn", "Occitan", "Indo-European-Romance"],
    ["por", "por_Latn", "Portuguese", "Indo-European-Romance"],
    ["ron", "ron_Latn", "Romanian", "Indo-European-Romance"],
    ["spa", "spa_Latn", "Spanish", "Indo-European-Romance"],
    ["bel", "bel_Cyrl", "Belarusian", "Indo-European-Slavic"],
    ["bos", "bos_Latn", "Bosnian", "Indo-European-Slavic"],
    ["bul", "bul_Cyrl", "Bulgarian", "Indo-European-Slavic"],
    ["ces", "ces_Latn", "Czech", "Indo-European-Slavic"],
    ["hrv", "hrv_Latn", "Croatian", "Indo-European-Slavic"],
    ["mkd", "mkd_Cyrl", "Macedonian", "Indo-European-Slavic"],
    ["pol", "pol_Latn", "Polish", "Indo-European-Slavic"],
    ["rus", "rus_Cyrl", "Russian", "Indo-European-Slavic"],
    ["slk", "slk_Latn", "Slovak", "Indo-European-Slavic"],
    ["slv", "slv_Latn", "Slovenian", "Indo-European-Slavic"],
    ["srp", "srp_Cyrl", "Serbian", "Indo-European-Slavic"],
    ["ukr", "ukr_Cyrl", "Ukrainian", "Indo-European-Slavic"],
    ["asm", "asm_Beng", "Assamese", "Indo-European-Indo-Aryan"],
    ["ben", "ben_Beng", "Bengali", "Indo-European-Indo-Aryan"],
    ["guj", "guj_Gujr", "Gujarati", "Indo-European-Indo-Aryan"],
    ["hin", "hin_Deva", "Hindi", "Indo-European-Indo-Aryan"],
    ["mar", "mar_Deva", "Marathi", "Indo-European-Indo-Aryan"],
    ["npi", "npi_Deva", "Nepali", "Indo-European-Indo-Aryan"],
    ["ory", "ory_Orya", "Oriya", "Indo-European-Indo-Aryan"],
    ["pan", "pan_Guru", "Punjabi", "Indo-European-Indo-Aryan"],
    ["snd", "snd_Arab", "Sindhi", "Indo-European-Indo-Aryan"],
    ["urd", "urd_Arab", "Urdu", "Indo-European-Indo-Aryan"],
    ["ckb", "ckb_Arab", "Kurdish", "Indo-European-Other"],
    ["cym", "cym_Latn", "Welsh", "Indo-European-Other"],
    ["ell", "ell_Grek", "Greek", "Indo-European-Other"],
    ["fas", "pes_Arab", "Persian", "Indo-European-Other"],
    ["gle", "gle_Latn", "Irish", "Indo-European-Other"],
    ["hye", "hye_Armn", "Armenian", "Indo-European-Other"],
    ["ita", "ita_Latn", "Italian", "Indo-European-Other"],
    ["lav", "lvs_Latn", "Latvian", "Indo-European-Other"],
    ["lit", "lit_Latn", "Lithuanian", "Indo-European-Other"],
    ["pus", "pbt_Arab", "Pashto", "Indo-European-Other"],
    ["tgk", "tgk_Cyrl", "Tajik", "Indo-European-Other"],
    ["ceb", "ceb_Latn", "Cebuano", "Austronesian"],
    ["ind", "ind_Latn", "Indonesian", "Austronesian"],
    ["jav", "jav_Latn", "Javanese", "Austronesian"],
    ["mri", "mri_Latn", "Maori", "Austronesian"],
    ["msa", "zsm_Latn", "Malay", "Austronesian"],
    ["tgl", "tgl_Latn", "Tagalog", "Austronesian"],
    ["ibo", "ibo_Latn", "Igbo", "Atlantic-Congo"],
    ["kam", "kam_Latn", "Kamba", "Atlantic-Congo"],
    ["kea", "kea_Latn", "Kabuverdianu", "Atlantic-Congo"],
    ["lin", "lin_Latn", "Lingala", "Atlantic-Congo"],
    ["lug", "lug_Latn", "Luganda", "Atlantic-Congo"],
    ["nso", "nso_Latn", "Northern Sotho", "Atlantic-Congo"],
    ["nya", "nya_Latn", "Nyanja", "Atlantic-Congo"],
    ["sna", "sna_Latn", "Shona", "Atlantic-Congo"],
    ["swh", "swh_Latn", "Swahili", "Atlantic-Congo"],
    ["umb", "umb_Latn", "Umbundu", "Atlantic-Congo"],
    ["wol", "wol_Latn", "Wolof", "Atlantic-Congo"],
    ["xho", "xho_Latn", "Xhosa", "Atlantic-Congo"],
    ["yor", "yor_Latn", "Yoruba", "Atlantic-Congo"],
    ["zul", "zul_Latn", "Zulu", "Atlantic-Congo"],
    ["amh", "amh_Ethi", "Amharic", "Afro-Asiatic"],
    ["ara", "arb_Arab", "Arabic", "Afro-Asiatic"],
    ["ful", "fuv_Latn", "Fulah", "Afro-Asiatic"],
    ["mlt", "mlt_Latn", "Maltese", "Afro-Asiatic"],
    ["orm", "gaz_Latn", "Oromo", "Afro-Asiatic"],
    ["som", "som_Latn", "Somali", "Afro-Asiatic"],
    ["azj", "azj_Latn", "Azerbaijani", "Turkic"],
    ["kaz", "kaz_Cyrl", "Kazakh", "Turkic"],
    ["kir", "kir_Cyrl", "Kyrgyz", "Turkic"],
    ["tur", "tur_Latn", "Turkish", "Turkic"],
    ["uzb", "uzn_Latn", "Uzbek", "Turkic"],
    ["kan", "kan_Knda", "Kannada", "Dravidian"],
    ["mal", "mal_Mlym", "Malayalam", "Dravidian"],
    ["tam", "tam_Taml", "Tamil", "Dravidian"],
    ["tel", "tel_Telu", "Telugu", "Dravidian"],
    ["mya", "mya_Mymr", "Burmese", "Sino-Tibetan"],
    ["zho_simpl", "zho_Hans", "Chinese (Simpl)", "Sino-Tibetan"],
    ["zho_trad", "zho_Hant", "Chinese (Trad)", "Sino-Tibetan"],
    ["est", "est_Latn", "Estonian", "Other"],
    ["fin", "fin_Latn", "Finnish", "Other"],
    ["hau", "hau_Latn", "Hausa", "Other"],
    ["heb", "heb_Hebr", "Hebrew", "Other"],
    ["hun", "hun_Latn", "Hungarian", "Other"],
    ["jpn", "jpn_Jpan", "Japanese", "Other"],
    ["kat", "kat_Geor", "Georgian", "Other"],
    ["khm", "khm_Khmr", "Khmer", "Other"],
    ["kor", "kor_Hang", "Korean", "Other"],
    ["lao", "lao_Laoo", "Lao", "Other"],
    ["luo", "luo_Latn", "Luo", "Other"],
    ["mon", "khk_Cyrl", "Mongolian", "Other"],
    ["tha", "tha_Thai", "Thai", "Other"],
    ["vie", "vie_Latn", "Vietnamese", "Other"],
 ]
 # Index the language rows by their first field.
 flores_lang_map = {_row[0]: _row for _row in _flores_lang_map}
 # Every non-English language is evaluated in both directions:
 # first all "eng-<lang>" subtasks, then all "<lang>-eng" subtasks.
 _flores_subtasks = [f"eng-{_code}" for _code in flores_lang_map if _code != "eng"]
 _flores_subtasks += [f"{_code}-eng" for _code in flores_lang_map if _code != "eng"]
 flores_datasets = []
 for _flores_subtask in _flores_subtasks:
    _src, _tgt = _flores_subtask.split("-")
    # Field 1 of a row is the FLORES column code; the other fields are not
    # needed by this prompt style.
    _flores_source = flores_lang_map[_src][1]
    _flores_target = flores_lang_map[_tgt][1]
    # Simplified Chinese -> English uses labelled lines; every other subtask
    # uses the bare "<source> = <target>" form.
    if _flores_subtask == "zho_simpl-eng":
        _ice_template = f"</E>Chinese: {{sentence_{_flores_source}}}\nEnglish: {{sentence_{_flores_target}}}"
    else:
        _ice_template = f"</E>{{sentence_{_flores_source}}} = {{sentence_{_flores_target}}}"
    flores_infer_cfg = {
        "ice_template": {
            "type": PromptTemplate,
            "template": _ice_template,
            "ice_token": "</E>",
        },
        "retriever": {"type": TopkRetriever, "ice_num": 8},
        "inferencer": {"type": GenInferencer},
    }
    # A Simplified-Chinese target switches both postprocessors to the
    # "flores-chinese" variant; all other targets use "flores".
    _postprocessor = "flores-chinese" if _tgt == "zho_simpl" else "flores"
    flores_eval_cfg = {
        "evaluator": {"type": BleuEvaluator},
        "pred_role": "BOT",
        "pred_postprocessor": {"type": _postprocessor},
        "dataset_postprocessor": {"type": _postprocessor},
    }
    flores_datasets.append({
        "type": FloresFirst100Dataset,
        "abbr": f"flores_100_{_src}-{_tgt}",
        "name": f"{_flores_source}-{_flores_target}",
        "reader_cfg": {
            "input_columns": f"sentence_{_flores_source}",
            "output_column": f"sentence_{_flores_target}",
            "train_split": "dev",
            "test_split": "devtest",
        },
        # Shallow copies, so the entry does not alias the module-level names.
        "infer_cfg": dict(flores_infer_cfg),
        "eval_cfg": dict(flores_eval_cfg),
    })
 # Remove the table and the loop temporaries from the module namespace.
 del _flores_lang_map, _flores_subtask, _src, _tgt, _flores_source, _flores_target, _ice_template, _postprocessor
--- a/configs/datasets/longbench/longbenchnq/longbench_nq_gen_d30cb9.py
+++ b/configs/datasets/longbench/longbenchnq/longbench_nq_gen_d30cb9.py
@ -1,38 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import LongBenchF1Evaluator, LongBenchnqDataset
 # Reader: 'context' and 'input' are the columns referenced by the prompt,
 # 'answers' is the output column; train and test both use the 'test' split.
 LongBench_nq_reader_cfg = {
    'input_columns': ['context', 'input'],
    'output_column': 'answers',
    'train_split': 'test',
    'test_split': 'test',
 }
 # Inference: a single HUMAN turn, no retrieved in-context examples
 # (ZeroRetriever), generation capped at 32 output tokens.
 LongBench_nq_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {
                    'role': 'HUMAN',
                    'prompt': 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
                },
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 32},
 }
 # Evaluation: predictions are taken from the BOT role and scored with
 # LongBenchF1Evaluator.
 LongBench_nq_eval_cfg = {
    'evaluator': {'type': LongBenchF1Evaluator},
    'pred_role': 'BOT',
 }
 LongBench_nq_datasets = [
    {
        'type': LongBenchnqDataset,
        'abbr': 'LongBench_nq',
        'path': 'THUDM/LongBench',
        'name': 'nq',
        'reader_cfg': LongBench_nq_reader_cfg,
        'infer_cfg': LongBench_nq_infer_cfg,
        'eval_cfg': LongBench_nq_eval_cfg,
    },
 ]
--- a/configs/datasets/math/math_gen.py
+++ b/configs/datasets/math/math_gen.py
@ -1,4 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    from .math_gen_265cce import math_datasets  # noqa: F401, F403
--- a/configs/datasets/math/math_gen_265cce.py
+++ b/configs/datasets/math/math_gen_265cce.py
@ -1,68 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess
 # Reader: 'problem' is the column substituted into the final prompt turn,
 # 'solution' is the output column.
 math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
 # Inference: four fixed worked examples (HUMAN problem / BOT solution pairs)
 # followed by the actual problem. No examples are retrieved (ZeroRetriever).
 #
 # Every LaTeX backslash is written as "\\" so that the literals contain no
 # invalid escape sequences (e.g. "\s", "\d", "\c"), which newer Python
 # versions warn about; the resulting runtime strings are unchanged.
 # The LaTeX braces are doubled in the source -- presumably so they survive
 # template formatting; confirm against PromptTemplate before changing them.
 math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt=
                "Problem:\nFind the domain of the expression $\\frac{{\\sqrt{{x-2}}}}{{\\sqrt{{5-x}}}}$.}}\nSolution:"
            ),
            dict(
                role="BOT",
                prompt=
                "The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n"
            ),
            dict(
                role="HUMAN",
                prompt=
                "Problem:\nIf $\\det \\mathbf{{A}} = 2$ and $\\det \\mathbf{{B}} = 12,$ then find $\\det (\\mathbf{{A}} \\mathbf{{B}}).$\nSolution:"
            ),
            dict(
                role="BOT",
                prompt=
                "We have that $\\det (\\mathbf{{A}} \\mathbf{{B}}) = (\\det \\mathbf{{A}})(\\det \\mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n"
            ),
            dict(
                role="HUMAN",
                prompt=
                "Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:"
            ),
            dict(
                role="BOT",
                prompt=
                "If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{{16}} \\end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n"
            ),
            dict(
                role="HUMAN",
                prompt=
                "Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:"
            ),
            dict(
                role="BOT",
                prompt=
                "If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n"
            ),
            # The real query: {problem} is the reader's input column.
            dict(role="HUMAN", prompt="Problem:\n{problem}\nSolution:\n"),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))
 # Evaluation: predictions go through math_postprocess before MATHEvaluator.
 math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess))
 math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='./data/math/math.json',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
 ]
--- a/configs/datasets/mbpp/mbpp_gen.py
+++ b/configs/datasets/mbpp/mbpp_gen.py
@ -1,4 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    from .mbpp_gen_1e1056 import mbpp_datasets  # noqa: F401, F403
--- a/configs/datasets/mbpp/mbpp_gen_5d6316.py
+++ b/configs/datasets/mbpp/mbpp_gen_5d6316.py
@ -1,42 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import MBPPDataset, MBPPEvaluator2
 # Reader: 'text' and 'test_list' are the columns referenced by the prompt
 # below; 'test_list_2' is the output column.
 mbpp_reader_cfg = dict(
    input_columns=['text', 'test_list'], output_column='test_list_2')
 # This prompt is used for the WizardLMCode series.
 # Use another config file for basic 3-shot generation.
 # Inference: a single HUMAN turn, no retrieved examples (ZeroRetriever),
 # generation capped at 512 output tokens.
 mbpp_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=
                """Below is an instruction that describes a task. Write a response that appropriately completes the request.
 ### Instruction:
 Create a Python script for this problem:
 {text}
 Test examples:
 {test_list}
 ### Response:"""),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))
 # Evaluation: predictions are taken from the BOT role and scored with
 # MBPPEvaluator2.
 mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator2), pred_role="BOT")
 mbpp_datasets = [
    dict(
        type=MBPPDataset,
        abbr='mbpp',
        path='./data/mbpp/mbpp.jsonl',
        reader_cfg=mbpp_reader_cfg,
        infer_cfg=mbpp_infer_cfg,
        eval_cfg=mbpp_eval_cfg)
 ]
--- a/configs/datasets/mbpp/mbpp_gen_6590b0.py
+++ b/configs/datasets/mbpp/mbpp_gen_6590b0.py
@ -1,27 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import MBPPDataset, MBPPEvaluator
 # Reader: 'text' and 'test_list' are the columns referenced by the prompt,
 # 'test_list_2' is the output column.
 mbpp_reader_cfg = dict(
    input_columns=['text', 'test_list'],
    output_column='test_list_2',
 )
 # Inference: one flat string prompt made of three solved examples followed by
 # the actual task. The adjacent string literals are concatenated at compile
 # time into a single template string.
 mbpp_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=(
            # Example 1: similar_elements
            "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n  res = tuple(set(test_tup1) & set(test_tup2))\r\n  return (res)' \n[DONE] \n\n "
            # Example 2: is_not_prime
            "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n    result = False\r\n    for i in range(2,int(math.sqrt(n)) + 1):\r\n        if n % i == 0:\r\n            result = True\r\n    return result' \n[DONE] \n\n "
            # Example 3: heap_queue_largest
            "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n  largest_nums = hq.nlargest(n, nums)\r\n  return largest_nums' \n[DONE] \n\n "
            # The actual task, left open after [BEGIN].
            "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list}  \n[BEGIN]\n"
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
 )
 mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator))
 mbpp_datasets = [
    dict(
        type=MBPPDataset,
        abbr='mbpp',
        path='./data/mbpp/mbpp.jsonl',
        reader_cfg=mbpp_reader_cfg,
        infer_cfg=mbpp_infer_cfg,
        eval_cfg=mbpp_eval_cfg,
    ),
 ]
--- a/configs/datasets/mbpp/mbpp_gen_78c1bc.py
+++ b/configs/datasets/mbpp/mbpp_gen_78c1bc.py
@ -1,64 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import MBPPDataset, MBPPEvaluator
 # Reader: 'text' and 'test_list' are the columns referenced by the prompt,
 # 'test_list_2' is the output column.
 mbpp_reader_cfg = dict(
    input_columns=['text', 'test_list'],
    output_column='test_list_2',
 )
 # Inference: a chat-style prompt. The (role, prompt) pairs below are expanded
 # into the round list in order: three solved examples, then the actual task
 # with the BOT turn opened at "[BEGIN]".
 mbpp_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role=_role, prompt=_prompt)
                for _role, _prompt in [
                    (
                        "HUMAN",
                        "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n",
                    ),
                    (
                        "BOT",
                        "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n  res = tuple(set(test_tup1) & set(test_tup2))\r\n  return (res)' \n[DONE] \n\n ",
                    ),
                    (
                        "HUMAN",
                        "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n",
                    ),
                    (
                        "BOT",
                        "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n    result = False\r\n    for i in range(2,int(math.sqrt(n)) + 1):\r\n        if n % i == 0:\r\n            result = True\r\n    return result' \n[DONE] \n\n ",
                    ),
                    (
                        "HUMAN",
                        "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n",
                    ),
                    (
                        "BOT",
                        "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n  largest_nums = hq.nlargest(n, nums)\r\n  return largest_nums' \n[DONE] \n\n ",
                    ),
                    (
                        "HUMAN",
                        "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list}  \n",
                    ),
                    ("BOT", "[BEGIN]\n"),
                ]
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
 )
 # Evaluation: predictions are taken from the BOT role and scored with
 # MBPPEvaluator.
 mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT")
 mbpp_datasets = [
    dict(
        type=MBPPDataset,
        abbr='mbpp',
        path='./data/mbpp/mbpp.jsonl',
        reader_cfg=mbpp_reader_cfg,
        infer_cfg=mbpp_infer_cfg,
        eval_cfg=mbpp_eval_cfg,
    ),
 ]
--- a/configs/datasets/mmlu/mmlu_gen.py
+++ b/configs/datasets/mmlu/mmlu_gen.py
@ -1,4 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    from .mmlu_gen_a484b3 import mmlu_datasets  # noqa: F401, F403
--- a/configs/datasets/mmlu/mmlu_gen_5d1409.py
+++ b/configs/datasets/mmlu/mmlu_gen_5d1409.py
@ -1,124 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import MMLUDataset
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 # None of the MMLU datasets on Hugging Face are parsed correctly, so we use our own dataset reader.
 # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
 # Reader config shared by every subject: the question text and the four
 # option columns are the inputs, ``target`` is the reference column.
 # NOTE(review): ``train_split='dev'`` presumably names the split the
 # in-context examples come from -- confirm against the dataset reader.
 mmlu_reader_cfg = dict(
    input_columns=["input", "A", "B", "C", "D"],
    output_column="target",
    train_split='dev')
 # The 57 MMLU subject names; the loop below emits one dataset config per entry.
 mmlu_all_sets = [
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_physics",
    "electrical_engineering",
    "astronomy",
    "anatomy",
    "abstract_algebra",
    "machine_learning",
    "clinical_knowledge",
    "global_facts",
    "management",
    "nutrition",
    "marketing",
    "professional_accounting",
    "high_school_geography",
    "international_law",
    "moral_scenarios",
    "computer_security",
    "high_school_microeconomics",
    "professional_law",
    "medical_genetics",
    "professional_psychology",
    "jurisprudence",
    "world_religions",
    "philosophy",
    "virology",
    "high_school_chemistry",
    "public_relations",
    "high_school_macroeconomics",
    "human_sexuality",
    "elementary_mathematics",
    "high_school_physics",
    "high_school_computer_science",
    "high_school_european_history",
    "business_ethics",
    "moral_disputes",
    "high_school_statistics",
    "miscellaneous",
    "formal_logic",
    "high_school_government_and_politics",
    "prehistory",
    "security_studies",
    "high_school_biology",
    "logical_fallacies",
    "high_school_world_history",
    "professional_medicine",
    "high_school_mathematics",
    "college_medicine",
    "high_school_us_history",
    "sociology",
    "econometrics",
    "high_school_psychology",
    "human_aging",
    "us_foreign_policy",
    "conceptual_physics",
 ]
 # Build one generation config per MMLU subject, using the fixed example ids 0-4.
 mmlu_datasets = []
 for _name in mmlu_all_sets:
    _subject = _name.replace('_', ' ')
    _hint = f'There is a single choice question about {_subject}. Answer the question by replying A, B, C or D.'
    # The in-context examples and the final query share the same question layout.
    _question = f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: '
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=_question),
                dict(role='BOT', prompt='{target}\n'),
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[dict(role='HUMAN', prompt=_question)],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever),
        inferencer=dict(type=GenInferencer, fix_id_list=[0, 1, 2, 3, 4]),
    )
    # Accuracy is computed on the first capital letter extracted from the output.
    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess),
    )
    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='./data/mmlu/',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))
 # Remove the loop temporaries so they do not stay in the module namespace.
 del _name, _hint, _subject, _question
--- a/configs/datasets/mmlu/mmlu_gen_79e572.py
+++ b/configs/datasets/mmlu/mmlu_gen_79e572.py
@ -1,110 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import MMLUDataset
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 # None of the MMLU datasets on Hugging Face are parsed correctly, so we use our own dataset reader.
 # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
 # Reader config shared by every subject: the question text and the four
 # option columns are the inputs, ``target`` is the reference column.
 # NOTE(review): ``train_split='dev'`` presumably names the split the
 # in-context examples come from -- confirm against the dataset reader.
 mmlu_reader_cfg = dict(
    input_columns=["input", "A", "B", "C", "D"],
    output_column="target",
    train_split='dev')
 # The 57 MMLU subject names; the loop below emits one dataset config per entry.
 mmlu_all_sets = [
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_physics",
    "electrical_engineering",
    "astronomy",
    "anatomy",
    "abstract_algebra",
    "machine_learning",
    "clinical_knowledge",
    "global_facts",
    "management",
    "nutrition",
    "marketing",
    "professional_accounting",
    "high_school_geography",
    "international_law",
    "moral_scenarios",
    "computer_security",
    "high_school_microeconomics",
    "professional_law",
    "medical_genetics",
    "professional_psychology",
    "jurisprudence",
    "world_religions",
    "philosophy",
    "virology",
    "high_school_chemistry",
    "public_relations",
    "high_school_macroeconomics",
    "human_sexuality",
    "elementary_mathematics",
    "high_school_physics",
    "high_school_computer_science",
    "high_school_european_history",
    "business_ethics",
    "moral_disputes",
    "high_school_statistics",
    "miscellaneous",
    "formal_logic",
    "high_school_government_and_politics",
    "prehistory",
    "security_studies",
    "high_school_biology",
    "logical_fallacies",
    "high_school_world_history",
    "professional_medicine",
    "high_school_mathematics",
    "college_medicine",
    "high_school_us_history",
    "sociology",
    "econometrics",
    "high_school_psychology",
    "human_aging",
    "us_foreign_policy",
    "conceptual_physics",
 ]
 # Build one plain-text (non-chat) generation config per MMLU subject.
 mmlu_datasets = []
 for _name in mmlu_all_sets:
    # Header line placed once in front of the in-context examples.
    _hint = 'The following are multiple choice questions (with answers) about  ' + _name.replace('_', ' ') + '.\n\n'
    mmlu_infer_cfg = dict(
        # Each in-context example ends with its reference answer.
        ice_template=dict(
            type=PromptTemplate,
            template='{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: {target}\n',
        ),
        # The query repeats the layout but stops right after "Answer:".
        prompt_template=dict(
            type=PromptTemplate,
            template=_hint + '</E>{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer:',
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever),
        inferencer=dict(type=GenInferencer, fix_id_list=[0, 1, 2, 3, 4]),
    )
    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess),
    )
    mmlu_datasets.append(
        dict(
            abbr='lukaemon_mmlu_' + _name,
            type=MMLUDataset,
            path='./data/mmlu/',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))
 # Remove the loop temporaries so they do not stay in the module namespace.
 del _name, _hint
--- a/configs/datasets/mmlu/mmlu_gen_a484b3.py
+++ b/configs/datasets/mmlu/mmlu_gen_a484b3.py
@ -1,124 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import MMLUDataset
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 # None of the MMLU datasets on Hugging Face are parsed correctly, so we use our own dataset reader.
 # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
 # Reader config shared by every subject: the question text and the four
 # option columns are the inputs, ``target`` is the reference column.
 # NOTE(review): ``train_split='dev'`` presumably names the split the
 # in-context examples come from -- confirm against the dataset reader.
 mmlu_reader_cfg = dict(
    input_columns=["input", "A", "B", "C", "D"],
    output_column="target",
    train_split='dev')
 # The 57 MMLU subject names; the loop below emits one dataset config per entry.
 mmlu_all_sets = [
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_physics",
    "electrical_engineering",
    "astronomy",
    "anatomy",
    "abstract_algebra",
    "machine_learning",
    "clinical_knowledge",
    "global_facts",
    "management",
    "nutrition",
    "marketing",
    "professional_accounting",
    "high_school_geography",
    "international_law",
    "moral_scenarios",
    "computer_security",
    "high_school_microeconomics",
    "professional_law",
    "medical_genetics",
    "professional_psychology",
    "jurisprudence",
    "world_religions",
    "philosophy",
    "virology",
    "high_school_chemistry",
    "public_relations",
    "high_school_macroeconomics",
    "human_sexuality",
    "elementary_mathematics",
    "high_school_physics",
    "high_school_computer_science",
    "high_school_european_history",
    "business_ethics",
    "moral_disputes",
    "high_school_statistics",
    "miscellaneous",
    "formal_logic",
    "high_school_government_and_politics",
    "prehistory",
    "security_studies",
    "high_school_biology",
    "logical_fallacies",
    "high_school_world_history",
    "professional_medicine",
    "high_school_mathematics",
    "college_medicine",
    "high_school_us_history",
    "sociology",
    "econometrics",
    "high_school_psychology",
    "human_aging",
    "us_foreign_policy",
    "conceptual_physics",
 ]
 # Build one chat-style generation config per MMLU subject, using the fixed
 # example ids 0-4.
 mmlu_datasets = []
 for _name in mmlu_all_sets:
    _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
    mmlu_infer_cfg = dict(
        # In-context example: HUMAN asks the question, BOT replies with the
        # reference answer.
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role="HUMAN",
                    prompt=
                    f"{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: "
                ),
                dict(role="BOT", prompt="{target}\n")
            ]),
        ),
        # Final query. It uses the same "Question:" / "Answer:" labels as the
        # in-context examples above; it previously used "Q:" / "A:", so the
        # query did not match the format of its own demonstrations.
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin="</E>",
                round=[
                    dict(
                        role="HUMAN",
                        prompt=
                        f"{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: "
                    ),
                ],
            ),
            ice_token="</E>",
        ),
        retriever=dict(type=FixKRetriever),
        inferencer=dict(type=GenInferencer, fix_id_list=[0, 1, 2, 3, 4]),
    )
    # Accuracy is computed on the first capital letter extracted from the output.
    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess))
    mmlu_datasets.append(
        dict(
            abbr=f"lukaemon_mmlu_{_name}",
            type=MMLUDataset,
            path="./data/mmlu/",
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))
 # Remove the loop temporaries so they do not stay in the module namespace.
 del _name, _hint
--- a/configs/datasets/mmlu/mmlu_ppl_ac766d.py
+++ b/configs/datasets/mmlu/mmlu_ppl_ac766d.py
@ -1,113 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import MMLUDataset
 # None of the MMLU datasets on Hugging Face are parsed correctly, so we use our own dataset reader.
 # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
 # Reader config shared by every subject: the question text and the four
 # option columns are the inputs, ``target`` is the reference column.
 # NOTE(review): ``train_split='dev'`` presumably names the split the
 # in-context examples come from -- confirm against the dataset reader.
 mmlu_reader_cfg = dict(
    input_columns=["input", "A", "B", "C", "D"],
    output_column="target",
    train_split='dev')
 # The 57 MMLU subject names; the loop below emits one dataset config per entry.
 mmlu_all_sets = [
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_physics",
    "electrical_engineering",
    "astronomy",
    "anatomy",
    "abstract_algebra",
    "machine_learning",
    "clinical_knowledge",
    "global_facts",
    "management",
    "nutrition",
    "marketing",
    "professional_accounting",
    "high_school_geography",
    "international_law",
    "moral_scenarios",
    "computer_security",
    "high_school_microeconomics",
    "professional_law",
    "medical_genetics",
    "professional_psychology",
    "jurisprudence",
    "world_religions",
    "philosophy",
    "virology",
    "high_school_chemistry",
    "public_relations",
    "high_school_macroeconomics",
    "human_sexuality",
    "elementary_mathematics",
    "high_school_physics",
    "high_school_computer_science",
    "high_school_european_history",
    "business_ethics",
    "moral_disputes",
    "high_school_statistics",
    "miscellaneous",
    "formal_logic",
    "high_school_government_and_politics",
    "prehistory",
    "security_studies",
    "high_school_biology",
    "logical_fallacies",
    "high_school_world_history",
    "professional_medicine",
    "high_school_mathematics",
    "college_medicine",
    "high_school_us_history",
    "sociology",
    "econometrics",
    "high_school_psychology",
    "human_aging",
    "us_foreign_policy",
    "conceptual_physics",
 ]
 # Build one perplexity-based config per MMLU subject: every answer letter
 # gets its own fully written-out candidate template.
 mmlu_datasets = []
 for _name in mmlu_all_sets:
    # Header line placed once in front of the in-context examples.
    _hint = 'The following are multiple choice questions (with answers) about  ' + _name.replace('_', ' ') + '.\n\n'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template={
                letter: '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: ' + letter + '\n'
                for letter in 'ABCD'
            },
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template={
                letter: _hint + '</E>{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: ' + letter
                for letter in 'ABCD'
            },
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever),
        inferencer=dict(type=PPLInferencer, fix_id_list=[0, 1, 2, 3, 4]),
    )
    # No postprocessor is configured for this PPL variant.
    mmlu_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
    mmlu_datasets.append(
        dict(
            abbr='lukaemon_mmlu_' + _name,
            type=MMLUDataset,
            path='./data/mmlu/',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))
 # Remove the loop temporaries so they do not stay in the module namespace.
 del _name, _hint
--- a/configs/datasets/race/race_ppl_abed12.py
+++ b/configs/datasets/race/race_ppl_abed12.py
@ -1,46 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import RaceDataset
 # Reader config: article, question and the four options are inputs;
 # ``answer`` is the reference column.
 race_reader_cfg = dict(
    input_columns=['article', 'question', 'A', 'B', 'C', 'D'],
    output_column='answer',
 )
 # Perplexity-based inference: one candidate prompt per answer letter, all
 # identical except for the letter appended after "Answer: ".
 race_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            letter:
            'Read the article, and answer the question by replying A, B, C or D.\n\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nAnswer: ' + letter
            for letter in ['A', 'B', 'C', 'D']
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
 )
 race_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
 # Both RACE subsets (middle, high) share the same reader / infer / eval configs.
 race_datasets = [
    dict(
        type=RaceDataset,
        abbr='race-' + subset,
        path='race',
        name=subset,
        reader_cfg=race_reader_cfg,
        infer_cfg=race_infer_cfg,
        eval_cfg=race_eval_cfg,
    ) for subset in ('middle', 'high')
 ]
--- a/configs/datasets/tydiqa/tydiqa_gen_978d2a.py
+++ b/configs/datasets/tydiqa/tydiqa_gen_978d2a.py
@ -1,51 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import TydiQADataset, TydiQAEvaluator
 # All configs are for the TydiQA GoldP task.
 # Reader config: passage and question are inputs, ``answer`` is the reference
 # column; both the test and the train split point at 'validation'.
 tydiqa_reader_cfg = dict(
    input_columns=["passage_text", "question_text"],
    output_column="answer",
    test_split='validation',
    train_split='validation',)
 # Languages to evaluate; each one is used as the dataset ``name`` and as a
 # key into ``prefixs_prompt`` below.
 langs = ['arabic', 'bengali', 'english', 'finnish', 'indonesian', 'japanese', 'korean', 'russian', 'swahili', 'telugu', 'thai']
 # Per-language prompt pieces, keyed by language name. Each value is a 4-tuple:
 # (instruction sentence, passage label, question label, answer label), indexed
 # as _hint[0].._hint[3] when the prompt template is built in the loop below.
 prefixs_prompt = {
    "english": ("Answer the following question based on the information in the given passage.", "Passage:", "Question:", "Answer:"),
    "arabic": ("أجب على السؤال التالي بناءً على المعلومات في المقطع المعطى.", "المقطع:", "السؤال:", "الإجابة:"),
    "bengali": ("প্রদত্ত অধ্যায়ের তথ্যের উপর ভিত্তি করে নিম্নলিখিত প্রশ্নের উত্তর দিন।", "অধ্যায়:", "প্রশ্ন:", "উত্তর:"),
    "finnish": ("Vastaa seuraavaan kysymykseen annetun kappaleen tiedon perusteella.", "Kappale:", "Kysymys:", "Vastaus:"),
    "indonesian": ("Jawab pertanyaan berikut berdasarkan informasi di bagian yang diberikan.", "Bagian:", "Pertanyaan:", "Jawaban:"),
    "korean": ("주어진 문단의 정보에 기반하여 다음 질문에 답하십시오.", "문단:", "질문:", "답변:"),
    "japanese":("文脈に基づいて質問に答えてください。","ぶんしょう:","しつもん:", "かいとう:"),
    "russian": ("Ответьте на следующий вопрос на основе информации в данном отрывке.", "Отрывок:", "Вопрос:", "Ответ:"),
    "swahili": ("Jibu swali lifuatalo kulingana na habari kwenye kifungu kilichotolewa.", "Kifungu:", "Swali:", "Jibu:"),
    "telugu": ("ఇచ్చిన పేరాలోని సమాచారం ఆధారంగా కింది ప్రశ్నకు సమాధానం ఇవ్వండి.", "పేరా:", "ప్రశ్న:", "సమాధానం:"),
    "thai":("ตอบคำถามต่อไปนี้โดยอิงตามข้อมูลในตอนข้อความที่กำหนด:", "ตอนข้อความ:", "คำถาม:", "คำตอบ:")
 }
 # Build one generation config per TydiQA GoldP language.
 tydiqa_datasets = []
 for _lang in langs:
    # (instruction, passage label, question label, answer label) for this language.
    _hint = prefixs_prompt[_lang]
    tydiqa_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=f"{_hint[0]}\n\n</E>{_hint[1]}{{passage_text}}\n{_hint[2]} {{question_text}}\n{_hint[3]} {{answer}}" ,
            ice_token='</E>'),
        retriever=dict(type=ZeroRetriever),
        # max_out_len is an option of the inferencer. It used to sit next to
        # ``inferencer`` as a key of the infer config itself, where the
        # 50-token cap appears to have been ignored.
        inferencer=dict(type=GenInferencer, max_out_len=50))
    tydiqa_eval_cfg = dict(evaluator=dict(type=TydiQAEvaluator),
                        ds_split='validation',
                        ds_column='answer',
                        )
    # NOTE(review): the abbr keeps the existing 'tyidqa' spelling on purpose;
    # it is presumably referenced elsewhere (e.g. result summaries) -- confirm
    # before renaming.
    tydiqa_datasets.append(
    dict(abbr=f'tyidqa-goldp_{_lang}',
        type=TydiQADataset,
        path='khalidalt/tydiqa-goldp',
        name=_lang,
        reader_cfg=tydiqa_reader_cfg,
        infer_cfg=tydiqa_infer_cfg,
        eval_cfg=tydiqa_eval_cfg))
 # Remove the loop temporaries so they do not stay in the module namespace.
 del _lang, _hint
--- a/configs/datasets/z_bench/z_bench_gen_52ba2f.py
+++ b/configs/datasets/z_bench/z_bench_gen_52ba2f.py
@ -1,25 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import HFDataset
 # Reader config: ``text`` is the only input column, ``category`` the output
 # column; the train split is pointed at 'test'.
 z_bench_reader_cfg = dict(
    input_columns=['text'], output_column='category', train_split='test')
 # The raw ``text`` field is used as the whole prompt (plain-string template).
 z_bench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{text}',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))
 # NOTE(review): ``path`` / ``data_dir`` are absolute paths on one specific
 # machine, so this config is not portable as written.
 # NOTE(review): this is a single dict with no ``eval_cfg``, whereas the other
 # ``*_datasets`` variables in these configs are lists -- confirm that
 # consumers accept a bare dict here.
 z_bench_datasets = dict(
    type=HFDataset,
    path=
    '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench',
    data_dir=
    '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench',
    name='question',
    reader_cfg=z_bench_reader_cfg,
    infer_cfg=z_bench_infer_cfg)
--- a/configs/datasets/z_bench/z_bench_gen_d8c84c.py
+++ b/configs/datasets/z_bench/z_bench_gen_d8c84c.py
@ -1,28 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import HFDataset
 # Reader config: ``text`` is the only input column, ``category`` the output
 # column; the train split is pointed at 'test'.
 # NOTE(review): ``ds_size=4`` presumably limits how many samples are read --
 # confirm against the dataset reader.
 z_bench_reader_cfg = dict(
    ds_size=4,
    input_columns=['text'],
    output_column='category',
    train_split='test')
 # Chat-style variant: the raw ``text`` field is sent as a single HUMAN turn.
 z_bench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role="HUMAN", prompt="{text}")]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))
 # NOTE(review): ``path`` / ``data_dir`` are absolute paths on one specific
 # machine, so this config is not portable as written.
 # NOTE(review): this is a single dict with no ``eval_cfg``, whereas the other
 # ``*_datasets`` variables in these configs are lists -- confirm that
 # consumers accept a bare dict here.
 z_bench_datasets = dict(
    type=HFDataset,
    path=
    '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench',
    data_dir=
    '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench',
    name='question',
    reader_cfg=z_bench_reader_cfg,
    infer_cfg=z_bench_infer_cfg)
--- a/configs/eval_codegeex2.py
+++ b/configs/eval_codegeex2.py
@ -1,7 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    from .datasets.humanevalx.humanevalx_gen import humanevalx_datasets
    from .models.codegeex2.hf_codegeex2_6b import models
 # Evaluate on the HumanEval-X datasets only; ``models`` comes from the import above.
 datasets = humanevalx_datasets
--- a/configs/eval_demo.py
+++ b/configs/eval_demo.py
@ -1,10 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    from .datasets.siqa.siqa_gen import siqa_datasets
    from .datasets.winograd.winograd_ppl import winograd_datasets
    from .models.opt.hf_opt_125m import opt125m
    from .models.opt.hf_opt_350m import opt350m
 # Demo run: the SIQA and Winograd datasets, evaluated on the two OPT model
 # configs imported above.
 datasets = [*siqa_datasets, *winograd_datasets]
 models = [opt125m, opt350m]
--- a/configs/eval_gpt3.5.py
+++ b/configs/eval_gpt3.5.py
@ -1,36 +0,0 @@
 from mmengine.config import read_base
 from opencompass.models import OpenAI
 from opencompass.partitioners import NaivePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.tasks import OpenICLInferTask
 with read_base():
    # choose a list of datasets
    from .datasets.collections.chat_medium import datasets
    # and output the results in a chosen format
    from .summarizers.medium import summarizer
 # Role mapping for the chat API; the BOT turn is the one flagged for generation.
 api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
 ])
 models = [
    dict(
        abbr='GPT-3.5-turbo-0613',
        type=OpenAI,
        path='gpt-3.5-turbo-0613',
        # 'ENV' means the key is read from $OPENAI_API_KEY; a literal key may
        # be written here instead.
        key='ENV',
        meta_template=api_meta_template,
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=8,
    ),
 ]
 # Inference runs locally with up to 8 workers and naive partitioning.
 infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=8,
        task=dict(type=OpenICLInferTask),
    ),
 )
--- a/configs/eval_gpt4.py
+++ b/configs/eval_gpt4.py
@ -1,40 +0,0 @@
 from mmengine.config import read_base
 from opencompass.models import OpenAI
 from opencompass.partitioners import NaivePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.tasks import OpenICLInferTask
 with read_base():
    from .datasets.collections.chat_medium import datasets
    from .summarizers.medium import summarizer
 # GPT-4 needs a dedicated HumanEval postprocessor, so swap it in for that
 # dataset only and leave every other dataset untouched.
 from opencompass.datasets.humaneval import humaneval_gpt_postprocess
 for _dataset in datasets:
    if _dataset['path'] != 'openai_humaneval':
        continue
    _dataset['eval_cfg']['pred_postprocessor']['type'] = humaneval_gpt_postprocess
 # Role mapping for the chat API; the BOT turn is the one flagged for generation.
 api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
 ])
 models = [
    dict(
        abbr='GPT4',
        type=OpenAI,
        path='gpt-4-0613',
        # 'ENV' means the key is read from $OPENAI_API_KEY; a literal key may
        # be written here instead.
        key='ENV',
        meta_template=api_meta_template,
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8,
    ),
 ]
 # Inference runs locally with up to 4 workers and naive partitioning.
 infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=4,
        task=dict(type=OpenICLInferTask),
    ),
 )
--- a/configs/eval_hf_llama_7b.py
+++ b/configs/eval_hf_llama_7b.py
@ -1,8 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    from .datasets.collections.base_medium_llama import piqa_datasets, siqa_datasets
    from .models.hf_llama.hf_llama_7b import models
 # Evaluate on the PIQA and SIQA datasets; ``models`` comes from the import above.
 datasets = [*piqa_datasets, *siqa_datasets]
--- a/configs/eval_internLM.py
+++ b/configs/eval_internLM.py
@ -1,9 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    # choose a list of datasets
    from .datasets.collections.base_medium import datasets
    # choose a model of interest
    from .models.internlm.internlm_7b import models
    # and output the results in a chosen format
    from .summarizers.medium import summarizer
--- a/configs/eval_internlm_7b.py
+++ b/configs/eval_internlm_7b.py
@ -1,9 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    # choose a list of datasets
    from .datasets.collections.base_medium import datasets
    # choose a model of interest
    from .models.hf_internlm.hf_internlm_7b import models
    # and output the results in a chosen format
    from .summarizers.medium import summarizer
--- a/configs/eval_internlm_chat_7b_turbomind.py
+++ b/configs/eval_internlm_chat_7b_turbomind.py
@ -1,32 +0,0 @@
 from mmengine.config import read_base
 from opencompass.models.turbomind import TurboMindModel
 with read_base():
    # choose a list of datasets
    from .datasets.SuperGLUE_CB.SuperGLUE_CB_gen import CB_datasets
    # and output the results in a chosen format
    from .summarizers.medium import summarizer
 # Only the SuperGLUE CB datasets are evaluated in this config.
 datasets = [*CB_datasets]
 # Chat markup wrapped around each turn; the BOT turn is the one flagged
 # generate=True.
 _meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
        dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
    ],
 )
 # A single TurboMind-served internlm-chat-7b model on one GPU / one process.
 # NOTE(review): ``model_path='./workspace'`` is relative to the working
 # directory -- presumably the converted TurboMind workspace; confirm.
 models = [
    dict(
        type=TurboMindModel,
        abbr='internlm-chat-7b-tb',
        path="internlm-chat-7b",
        model_path='./workspace',
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        meta_template=_meta_template,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
 ]
--- a/configs/eval_llama2_7b.py
+++ b/configs/eval_llama2_7b.py
@ -1,8 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    from .datasets.collections.base_medium_llama import piqa_datasets, siqa_datasets
    from .models.llama.llama2_7b import models
 # Evaluate on the PIQA and SIQA datasets; ``models`` comes from the import above.
 datasets = [*piqa_datasets, *siqa_datasets]
--- a/configs/eval_openai_agent.py
+++ b/configs/eval_openai_agent.py
@ -1,148 +0,0 @@
 from mmengine.config import read_base
 from opencompass.partitioners import SizePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.tasks import OpenICLInferTask
 from opencompass.openicl import AgentInferencer
 with read_base():
    from .summarizers.medium import summarizer
    from .datasets.gsm8k.gsm8k_gen import gsm8k_datasets as datasets
 from opencompass.models.lagent import LagentAgent
 from lagent.llms import GPTAPI
 from lagent.agents.react import ReAct, ReActProtocol
 from lagent.actions import PythonInterpreter
 # Prompt passed as ``force_stop`` to the ReAct protocol below.
 FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
 # Few-shot call-protocol prompt for the ReAct agent. Single-brace fields
 # ({tool_description}, {thought}, {action}, {action_names}, {action_input},
 # {response}, {finish}) are placeholders; doubled braces are literal braces in
 # the worked examples (presumably the text is filled in with str.format --
 # confirm against the protocol class).
 # Every ``` fence in this prompt must be closed with three backticks: the
 # fence after the {response} line used to be closed with only two.
 FEWSHOT_INSTRUCTION = """\
 You are a assistant who can utilize external tools.
 {tool_description}
 To use a tool, please use the following format:
 ```
 {thought} Think what you need to solve, do you need to use tools?
 {action} the tool name, should be one of [{action_names}]
 {action_input} the input to the action
 ```
 I will give you response after utilizing tools should using the following format:
 ```
 {response} the results after call the tool.
 ```
 If you already know the answer, or you do not need to use tools,
 please using the following format to reply:
 ```
 {thought} the thought process to get the final answer
 {finish} final answer
 ```
 Examples:
 <HUMAN>A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?
 <ASSISTANT>{thought} We need to calculate the total number of fruits. The total number of fruits in the first three baskets is given, while for the fourth basket, we need to subtract 2 from each fruit category. We can solve this problem using simple arithmetic.
 {action} PythonInterpreter
 {action_input}
 ```python
 def solution():
    # Fruits in the first three baskets
    apples_first_three = 9
    oranges_first_three = 15
    bananas_first_three = 14
    # Fruits in the fourth basket
    apples_fourth = apples_first_three - 2
    oranges_fourth = oranges_first_three - 2
    bananas_fourth = bananas_first_three - 2
    # Total fruits
    total_fruits = ((apples_first_three + oranges_first_three + bananas_first_three) * 3 +
                    apples_fourth + oranges_fourth + bananas_fourth)
    return {{"total_fruits": total_fruits}}
 ```
 <SYSTEM>{response}{{'total_fruits': 146}}
 <ASSISTANT> {thought} By adding the given numbers of apples, oranges, and bananas in the first three baskets, then subtracting 2 from each category for the fourth basket, we have found the total number of fruits.
 {finish} 146
 <HUMAN>Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?
 <ASSISTANT>{thought} This is a problem that requires solving equations. We know the relationship between the number of marbles, frisbees, and deck cards. Bella has twice as many marbles as frisbees, and 20 more frisbees than deck cards. Finally, we are told Bella buys 2/5 times more of each item. This purchasing will increase the number of each type of item.
 {action} PythonInterpreter
 {action_input}
 ```python
 def solution():
    # Given number of marbles
    marbles_now = 60
    # Calculate number of frisbees and deck cards now
    frisbees_now = marbles_now / 2
    cards_now = frisbees_now - 20
    # Calculate number of each item after buying more
    marbles_then = marbles_now + (2/5) * marbles_now
    frisbees_then = frisbees_now + (2/5) * frisbees_now
    cards_then = cards_now + (2/5)*cards_now
    # Total number of items then
    total_items = marbles_then + frisbees_then + cards_then
    return {{"total_items": total_items}}
 ```
 <SYSTEM>{response}{{'total_items': 140.0}}
 <ASSISTANT>{thought} By establishing the relationships between the numbers of marbles, frisbees, and deck cards that Bella currently has, we can calculate how many of each item she will have after buying 2/5 more of each. Adding these quantities together gives us the total number of items.
 {finish} 140
 Begin!
 """
 # Tool description handed to the PythonInterpreter action below.
 # NOTE(review): the text says ``solution`` must return a dict keyed by
 # variable name, but the sample inside it ends with ``return final_answer``
 # (a bare value), while the few-shot examples in FEWSHOT_INSTRUCTION return
 # dicts -- confirm which form the interpreter expects before changing the text.
 PYTHON_INTERPRETER_DESCRIPTION = '''\
 It can run a Python code. The code must be a valid code that contains only python method, and the method' name must be 'solution' and returns a dict, which key is variable name. The libraries I recommend are sympy and scipy. the format is:
 ```python
 # import packages
 import xxx
 def solution():
    # initialize some variables
    variable_names_with_real_meaning = xxx
    # middle steps
    mid_variable = func(mid_variable)
    # final answer
    final_answer =  func(mid_variable)
    return final_answer
 ```'''
 # One ReAct agent backed by gpt-3.5-turbo with a Python interpreter tool,
 # limited to 3 turns per sample.
 models = [
    dict(abbr='gpt-3.5-react',
         type=LagentAgent,
         agent_type=ReAct,
         max_turn=3,
         # Underlying LLM; key='ENV' presumably reads the API key from the
         # environment, as in the other OpenAI configs -- confirm for GPTAPI.
         llm=dict(
             type=GPTAPI,
             model_type='gpt-3.5-turbo',
             key='ENV',
             query_per_second=1,
             max_seq_len=4096,
         ),
         # The only tool exposed to the agent.
         actions=[
             dict(type=PythonInterpreter,
                  description=PYTHON_INTERPRETER_DESCRIPTION),
         ],
         # Protocol wiring: few-shot call instructions, the forced-stop prompt
         # and the marker that introduces the final answer.
         protocol=dict(
             type=ReActProtocol,
             call_protocol=FEWSHOT_INSTRUCTION,
             force_stop=FORCE_STOP_PROMPT_EN,
             finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
         ),
         batch_size=8),
 ]
 # Patch every imported dataset config in place for agent-based inference.
 for dataset in datasets:
    # Use AgentInferencer instead of GenInferencer
    dataset['infer_cfg']['inferencer'] = dict(type=AgentInferencer)
    # Use the question as agent input directly.
    dataset['infer_cfg']['prompt_template']['template'] = "{question}"
 # Inference runs locally with up to 16 workers; tasks are split by size
 # (max_task_size=1000).
 infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=1000),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask)),
 )
--- a/configs/models/codellama/hf_codellama_13b.py
+++ b/configs/models/codellama/hf_codellama_13b.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: CodeLlama 13B loaded through HuggingFaceCausalLM.
 models = [
    # CodeLlama 13B
    dict(
        type=HuggingFaceCausalLM,
        abbr='CodeLlama-13b',
        path="codellama/CodeLlama-13b-hf",
        tokenizer_path='codellama/CodeLlama-13b-hf',
        # Pad and truncate on the left side.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        max_out_len=1024,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Resources requested for this model: 2 GPUs, 1 process.
        run_cfg=dict(num_gpus=2, num_procs=1),
    ),
 ]
--- a/configs/models/codellama/hf_codellama_13b_instruct.py
+++ b/configs/models/codellama/hf_codellama_13b_instruct.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: CodeLlama 13B Instruct loaded through
 # HuggingFaceCausalLM.
 models = [
    # CodeLlama 13B Instruct
    dict(
        type=HuggingFaceCausalLM,
        abbr='CodeLlama-13b-Instruct',
        path="codellama/CodeLlama-13b-Instruct-hf",
        tokenizer_path='codellama/CodeLlama-13b-Instruct-hf',
        # Pad and truncate on the left side.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        max_out_len=1024,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Resources requested for this model: 2 GPUs, 1 process.
        run_cfg=dict(num_gpus=2, num_procs=1),
    ),
 ]
--- a/configs/models/codellama/hf_codellama_13b_python.py
+++ b/configs/models/codellama/hf_codellama_13b_python.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: CodeLlama 13B Python loaded through
 # HuggingFaceCausalLM.
 models = [
    # CodeLlama 13B Python
    dict(
        type=HuggingFaceCausalLM,
        abbr='CodeLlama-13b-Python',
        path="codellama/CodeLlama-13b-Python-hf",
        tokenizer_path='codellama/CodeLlama-13b-Python-hf',
        # Pad and truncate on the left side.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        max_out_len=1024,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Resources requested for this model: 2 GPUs, 1 process.
        run_cfg=dict(num_gpus=2, num_procs=1),
    ),
 ]
--- a/configs/models/codellama/hf_codellama_34b_instruct.py
+++ b/configs/models/codellama/hf_codellama_34b_instruct.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: CodeLlama 34B Instruct loaded through
 # HuggingFaceCausalLM.
 models = [
    # CodeLlama 34B Instruct
    dict(
        type=HuggingFaceCausalLM,
        abbr='CodeLlama-34b-Instruct',
        path="codellama/CodeLlama-34b-Instruct-hf",
        tokenizer_path='codellama/CodeLlama-34b-Instruct-hf',
        # Pad and truncate on the left side.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        max_out_len=1024,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Resources requested for this model: 4 GPUs, 1 process.
        run_cfg=dict(num_gpus=4, num_procs=1),
    ),
 ]
--- a/configs/models/codellama/hf_codellama_7b.py
+++ b/configs/models/codellama/hf_codellama_7b.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: CodeLlama 7B loaded through HuggingFaceCausalLM.
 models = [
    # CodeLlama 7B
    dict(
        type=HuggingFaceCausalLM,
        abbr='CodeLlama-7b',
        path="codellama/CodeLlama-7b-hf",
        tokenizer_path='codellama/CodeLlama-7b-hf',
        # Pad and truncate on the left side.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        max_out_len=1024,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Resources requested for this model: 1 GPU, 1 process.
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
 ]
--- a/configs/models/codellama/hf_codellama_7b_instruct.py
+++ b/configs/models/codellama/hf_codellama_7b_instruct.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: CodeLlama 7B Instruct loaded through
 # HuggingFaceCausalLM.
 models = [
    # CodeLlama 7B Instruct
    dict(
        type=HuggingFaceCausalLM,
        abbr='CodeLlama-7b-Instruct',
        path="codellama/CodeLlama-7b-Instruct-hf",
        tokenizer_path='codellama/CodeLlama-7b-Instruct-hf',
        # Pad and truncate on the left side.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        max_out_len=1024,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Resources requested for this model: 1 GPU, 1 process.
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
 ]
--- a/configs/models/codellama/hf_codellama_7b_python.py
+++ b/configs/models/codellama/hf_codellama_7b_python.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: CodeLlama 7B Python loaded through
 # HuggingFaceCausalLM.
 models = [
    # CodeLlama 7B Python
    dict(
        type=HuggingFaceCausalLM,
        abbr='CodeLlama-7b-Python',
        path="codellama/CodeLlama-7b-Python-hf",
        tokenizer_path='codellama/CodeLlama-7b-Python-hf',
        # Pad and truncate on the left side.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        max_out_len=1024,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Resources requested for this model: 1 GPU, 1 process.
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
 ]
--- a/configs/models/falcon/hf_falcon_7b.py
+++ b/configs/models/falcon/hf_falcon_7b.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: Falcon 7B ('tiiuae/falcon-7b') loaded through
 # HuggingFaceCausalLM.
 models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='falcon-7b-hf',
        path='tiiuae/falcon-7b',
        tokenizer_path='tiiuae/falcon-7b',
        # Pad and truncate on the left side.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        # `revision` is fixed to one commit hash (presumably forwarded to the
        # model loader so the weights/code do not drift — confirm).
        model_kwargs=dict(trust_remote_code=True, device_map='auto', revision='2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5'),
        # Resources requested for this model: 1 GPU, 1 process.
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
 ]
--- a/configs/models/hf_internlm/hf_internlm_7b.py
+++ b/configs/models/hf_internlm/hf_internlm_7b.py
@ -1,22 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: InternLM 7B ("internlm/internlm-7b") loaded
 # through HuggingFaceCausalLM.
 models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='internlm-7b-hf',
        path="internlm/internlm-7b",
        tokenizer_path='internlm/internlm-7b',
        # Pad and truncate on the left side; the fast tokenizer is disabled.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
            trust_remote_code=True,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Resources requested for this model: 1 GPU, 1 process.
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
 ]
--- a/configs/models/hf_llama/hf_llama2_13b.py
+++ b/configs/models/hf_llama/hf_llama2_13b.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: Llama-2 13B ("meta-llama/Llama-2-13b-hf") loaded
 # through HuggingFaceCausalLM.
 models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='llama-2-13b-hf',
        path="meta-llama/Llama-2-13b-hf",
        tokenizer_path='meta-llama/Llama-2-13b-hf',
        # Pad and truncate on the left side; the fast tokenizer is disabled.
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              use_fast=False,
                              ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        batch_padding=False, # if false, inference with for-loop without batch padding
        # Resources requested for this model: 2 GPUs, 1 process.
        run_cfg=dict(num_gpus=2, num_procs=1),
    )
 ]
--- a/configs/models/hf_llama/hf_llama2_70b.py
+++ b/configs/models/hf_llama/hf_llama2_70b.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: Llama-2 70B ("meta-llama/Llama-2-70b-hf") loaded
 # through HuggingFaceCausalLM.
 models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='llama-2-70b-hf',
        path="meta-llama/Llama-2-70b-hf",
        tokenizer_path='meta-llama/Llama-2-70b-hf',
        # Pad and truncate on the left side; the fast tokenizer is disabled.
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              use_fast=False,
                              ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        batch_padding=False, # if false, inference with for-loop without batch padding
        # Resources requested for this model: 8 GPUs, 1 process.
        run_cfg=dict(num_gpus=8, num_procs=1),
    )
 ]
--- a/configs/models/hf_llama/hf_llama2_7b.py
+++ b/configs/models/hf_llama/hf_llama2_7b.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: Llama-2 7B ("meta-llama/Llama-2-7b-hf") loaded
 # through HuggingFaceCausalLM.
 models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='llama-2-7b-hf',
        path="meta-llama/Llama-2-7b-hf",
        tokenizer_path='meta-llama/Llama-2-7b-hf',
        # Pad and truncate on the left side; the fast tokenizer is disabled.
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              use_fast=False,
                              ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        batch_padding=False, # if false, inference with for-loop without batch padding
        # Resources requested for this model: 1 GPU, 1 process.
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
 ]
--- a/configs/models/hf_llama/hf_llama_13b.py
+++ b/configs/models/hf_llama/hf_llama_13b.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: LLaMA 13B ("huggyllama/llama-13b") loaded through
 # HuggingFaceCausalLM.
 models = [
    # LLaMA 13B
    dict(
        type=HuggingFaceCausalLM,
        abbr='llama-13b-hf',
        path="huggyllama/llama-13b",
        tokenizer_path='huggyllama/llama-13b',
        # Pad and truncate on the left side; the fast tokenizer is disabled.
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              use_fast=False,
                              ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        batch_padding=False, # if false, inference with for-loop without batch padding
        # Resources requested for this model: 2 GPUs, 1 process.
        run_cfg=dict(num_gpus=2, num_procs=1),
    )
 ]
--- a/configs/models/hf_llama/hf_llama_30b.py
+++ b/configs/models/hf_llama/hf_llama_30b.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: LLaMA 30B ("huggyllama/llama-30b") loaded through
 # HuggingFaceCausalLM.
 models = [
    # LLaMA 30B
    dict(
        type=HuggingFaceCausalLM,
        abbr='llama-30b-hf',
        path="huggyllama/llama-30b",
        tokenizer_path='huggyllama/llama-30b',
        # Pad and truncate on the left side; the fast tokenizer is disabled.
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              use_fast=False,
                              ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        batch_padding=False, # if false, inference with for-loop without batch padding
        # Resources requested for this model: 4 GPUs, 1 process.
        run_cfg=dict(num_gpus=4, num_procs=1),
    )
 ]
--- a/configs/models/hf_llama/hf_llama_65b.py
+++ b/configs/models/hf_llama/hf_llama_65b.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: LLaMA 65B ("huggyllama/llama-65b") loaded through
 # HuggingFaceCausalLM.
 models = [
    # LLaMA 65B
    dict(
        type=HuggingFaceCausalLM,
        abbr='llama-65b-hf',
        path="huggyllama/llama-65b",
        tokenizer_path='huggyllama/llama-65b',
        # Pad and truncate on the left side; the fast tokenizer is disabled.
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              use_fast=False,
                              ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        batch_padding=False, # if false, inference with for-loop without batch padding
        # Resources requested for this model: 8 GPUs, 1 process.
        run_cfg=dict(num_gpus=8, num_procs=1),
    )
 ]
--- a/configs/models/hf_llama/hf_llama_7b.py
+++ b/configs/models/hf_llama/hf_llama_7b.py
@ -1,21 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: LLaMA 7B ("huggyllama/llama-7b") loaded through
 # HuggingFaceCausalLM.
 models = [
    # LLaMA 7B
    dict(
        type=HuggingFaceCausalLM,
        abbr='llama-7b-hf',
        path="huggyllama/llama-7b",
        tokenizer_path='huggyllama/llama-7b',
        # Pad and truncate on the left side; the fast tokenizer is disabled.
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              use_fast=False,
                              ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        batch_padding=False, # if false, inference with for-loop without batch padding
        # Resources requested for this model: 1 GPU, 1 process.
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
 ]
--- a/configs/models/llama/llama2_13b.py
+++ b/configs/models/llama/llama2_13b.py
@ -1,23 +0,0 @@
 from opencompass.models import Llama2
 # Follow the instructions in Meta AI's repository, https://github.com/facebookresearch/llama,
 # and download the LLaMA-2 model and tokenizer to './models/llama2/llama/'.
 #
 # The `llama` package from that repository must also be installed:
 #
 #   git clone https://github.com/facebookresearch/llama.git
 #   cd llama
 #   pip install -e .
 #
 # Model list with one entry: LLaMA-2 13B read from the local paths below.
 models = [
    dict(
        abbr="llama-2-13b",
        type=Llama2,
        path="./models/llama2/llama/llama-2-13b/",
        tokenizer_path="./models/llama2/llama/tokenizer.model",
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        # Resources requested for this model: 2 GPUs, 2 processes.
        run_cfg=dict(num_gpus=2, num_procs=2),
    ),
 ]
--- a/configs/models/llama/llama2_13b_chat.py
+++ b/configs/models/llama/llama2_13b_chat.py
@ -1,31 +0,0 @@
 from opencompass.models import Llama2Chat
 # Follow the instructions in Meta AI's repository, https://github.com/facebookresearch/llama,
 # and download the LLaMA-2-Chat model and tokenizer to './models/llama2/llama/'.
 #
 # The `llama` package from that repository must also be installed:
 #
 #   git clone https://github.com/facebookresearch/llama.git
 #   cd llama
 #   pip install -e .
 #
 # Dialogue template: one round is a HUMAN turn followed by a BOT turn, and the
 # BOT turn is the one marked `generate=True`.
 api_meta_template = dict(
    round=[
        dict(role="HUMAN", api_role="HUMAN"),
        dict(role="BOT", api_role="BOT", generate=True),
    ],
 )
 # Model list with one entry: LLaMA-2 13B Chat read from the local paths below.
 models = [
    dict(
        abbr="llama-2-13b-chat",
        type=Llama2Chat,
        path="./models/llama2/llama/llama-2-13b-chat/",
        tokenizer_path="./models/llama2/llama/tokenizer.model",
        meta_template=api_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        # Resources requested for this model: 2 GPUs, 2 processes.
        run_cfg=dict(num_gpus=2, num_procs=2),
    ),
 ]
--- a/configs/models/llama/llama2_70b.py
+++ b/configs/models/llama/llama2_70b.py
@ -1,23 +0,0 @@
 from opencompass.models import Llama2
 # Follow the instructions in Meta AI's repository, https://github.com/facebookresearch/llama,
 # and download the LLaMA-2 model and tokenizer to './models/llama2/llama/'.
 #
 # The `llama` package from that repository must also be installed:
 #
 #   git clone https://github.com/facebookresearch/llama.git
 #   cd llama
 #   pip install -e .
 #
 # Model list with one entry: LLaMA-2 70B read from the local paths below.
 models = [
    dict(
        abbr="llama-2-70b",
        type=Llama2,
        path="./models/llama2/llama/llama-2-70b/",
        tokenizer_path="./models/llama2/llama/tokenizer.model",
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        # Resources requested for this model: 8 GPUs, 8 processes.
        run_cfg=dict(num_gpus=8, num_procs=8),
    ),
 ]
--- a/configs/models/llama/llama2_70b_chat.py
+++ b/configs/models/llama/llama2_70b_chat.py
@ -1,31 +0,0 @@
 from opencompass.models import Llama2Chat
 # Follow the instructions in Meta AI's repository, https://github.com/facebookresearch/llama,
 # and download the LLaMA-2-Chat model and tokenizer to './models/llama2/llama/'.
 #
 # The `llama` package from that repository must also be installed:
 #
 #   git clone https://github.com/facebookresearch/llama.git
 #   cd llama
 #   pip install -e .
 #
 # Dialogue template: one round is a HUMAN turn followed by a BOT turn, and the
 # BOT turn is the one marked `generate=True`.
 api_meta_template = dict(
    round=[
        dict(role="HUMAN", api_role="HUMAN"),
        dict(role="BOT", api_role="BOT", generate=True),
    ],
 )
 # Model list with one entry: LLaMA-2 70B Chat read from the local paths below.
 models = [
    dict(
        abbr="llama-2-70b-chat",
        type=Llama2Chat,
        path="./models/llama2/llama/llama-2-70b-chat/",
        tokenizer_path="./models/llama2/llama/tokenizer.model",
        meta_template=api_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        # Resources requested for this model: 8 GPUs, 8 processes.
        run_cfg=dict(num_gpus=8, num_procs=8),
    ),
 ]
--- a/configs/models/llama/llama2_7b.py
+++ b/configs/models/llama/llama2_7b.py
@ -1,23 +0,0 @@
 from opencompass.models import Llama2
 # Follow the instructions in Meta AI's repository, https://github.com/facebookresearch/llama,
 # and download the LLaMA-2 model and tokenizer to './models/llama2/llama/'.
 #
 # The `llama` package from that repository must also be installed:
 #
 #   git clone https://github.com/facebookresearch/llama.git
 #   cd llama
 #   pip install -e .
 #
 # Model list with one entry: LLaMA-2 7B read from the local paths below.
 models = [
    dict(
        abbr="llama-2-7b",
        type=Llama2,
        path="./models/llama2/llama/llama-2-7b/",
        tokenizer_path="./models/llama2/llama/tokenizer.model",
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        # Resources requested for this model: 1 GPU, 1 process.
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
 ]
--- a/configs/models/llama/llama2_7b_chat.py
+++ b/configs/models/llama/llama2_7b_chat.py
@ -1,31 +0,0 @@
 from opencompass.models import Llama2Chat
 # Follow the instructions in Meta AI's repository, https://github.com/facebookresearch/llama,
 # and download the LLaMA-2-Chat model and tokenizer to './models/llama2/llama/'.
 #
 # The `llama` package from that repository must also be installed:
 #
 #   git clone https://github.com/facebookresearch/llama.git
 #   cd llama
 #   pip install -e .
 #
 # Dialogue template: one round is a HUMAN turn followed by a BOT turn, and the
 # BOT turn is the one marked `generate=True`.
 api_meta_template = dict(
    round=[
        dict(role="HUMAN", api_role="HUMAN"),
        dict(role="BOT", api_role="BOT", generate=True),
    ],
 )
 # Model list with one entry: LLaMA-2 7B Chat read from the local paths below.
 models = [
    dict(
        abbr="llama-2-7b-chat",
        type=Llama2Chat,
        path="./models/llama2/llama/llama-2-7b-chat/",
        tokenizer_path="./models/llama2/llama/tokenizer.model",
        meta_template=api_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        # Resources requested for this model: 1 GPU, 1 process.
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
 ]
--- a/configs/models/llama/llama_13b.py
+++ b/configs/models/llama/llama_13b.py
@ -1,24 +0,0 @@
 from opencompass.models import Llama2
 # Follow the instructions in Meta AI's repository, https://github.com/facebookresearch/llama/tree/llama_v1,
 # and download the LLaMA model and tokenizer to './models/llama/'.
 #
 # The `llama` package must also be installed.
 # *Note*: the LLaMA-2 branch is fully compatible with LLaMA-1, so the LLaMA-2
 # branch (and the `Llama2` model class) is used here.
 #
 #   git clone https://github.com/facebookresearch/llama.git
 #   cd llama
 #   pip install -e .
 #
 # Model list with one entry: LLaMA 13B read from the local paths below.
 models = [
    dict(
        abbr="llama-13b",
        type=Llama2,
        path="./models/llama/13B/",
        tokenizer_path="./models/llama/tokenizer.model",
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        # Resources requested for this model: 2 GPUs, 2 processes.
        run_cfg=dict(num_gpus=2, num_procs=2),
    ),
 ]
--- a/configs/models/llama/llama_30b.py
+++ b/configs/models/llama/llama_30b.py
@ -1,24 +0,0 @@
 from opencompass.models import Llama2
 # Follow the instructions in Meta AI's repository, https://github.com/facebookresearch/llama/tree/llama_v1,
 # and download the LLaMA model and tokenizer to './models/llama/'.
 #
 # The `llama` package must also be installed.
 # *Note*: the LLaMA-2 branch is fully compatible with LLaMA-1, so the LLaMA-2
 # branch (and the `Llama2` model class) is used here.
 #
 #   git clone https://github.com/facebookresearch/llama.git
 #   cd llama
 #   pip install -e .
 #
 # Model list with one entry: LLaMA 30B read from the local paths below.
 models = [
    dict(
        abbr="llama-30b",
        type=Llama2,
        path="./models/llama/30B/",
        tokenizer_path="./models/llama/tokenizer.model",
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        # Resources requested for this model: 4 GPUs, 4 processes.
        run_cfg=dict(num_gpus=4, num_procs=4),
    ),
 ]
--- a/configs/models/llama/llama_65b.py
+++ b/configs/models/llama/llama_65b.py
@ -1,24 +0,0 @@
 from opencompass.models import Llama2
 # Follow the instructions in Meta AI's repository, https://github.com/facebookresearch/llama/tree/llama_v1,
 # and download the LLaMA model and tokenizer to './models/llama/'.
 #
 # The `llama` package must also be installed.
 # *Note*: the LLaMA-2 branch is fully compatible with LLaMA-1, so the LLaMA-2
 # branch (and the `Llama2` model class) is used here.
 #
 #   git clone https://github.com/facebookresearch/llama.git
 #   cd llama
 #   pip install -e .
 #
 # Model list with one entry: LLaMA 65B read from the local paths below.
 models = [
    dict(
        abbr="llama-65b",
        type=Llama2,
        path="./models/llama/65B/",
        tokenizer_path="./models/llama/tokenizer.model",
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        # Resources requested for this model: 8 GPUs, 8 processes.
        run_cfg=dict(num_gpus=8, num_procs=8),
    ),
 ]
--- a/configs/models/llama/llama_7b.py
+++ b/configs/models/llama/llama_7b.py
@ -1,24 +0,0 @@
 from opencompass.models import Llama2
 # Follow the instructions in Meta AI's repository, https://github.com/facebookresearch/llama/tree/llama_v1,
 # and download the LLaMA model and tokenizer to './models/llama/'.
 #
 # The `llama` package must also be installed.
 # *Note*: the LLaMA-2 branch is fully compatible with LLaMA-1, so the LLaMA-2
 # branch (and the `Llama2` model class) is used here.
 #
 #   git clone https://github.com/facebookresearch/llama.git
 #   cd llama
 #   pip install -e .
 #
 # Model list with one entry: LLaMA 7B read from the local paths below.
 models = [
    dict(
        abbr="llama-7b",
        type=Llama2,
        path="./models/llama/7B/",
        tokenizer_path="./models/llama/tokenizer.model",
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        # Resources requested for this model: 1 GPU, 1 process.
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
 ]
--- a/configs/models/openai/gpt_3.5_turbo.py
+++ b/configs/models/openai/gpt_3.5_turbo.py
@ -1,7 +0,0 @@
 from opencompass.models import OpenAI
 # Model list with one entry: gpt-3.5-turbo accessed through the OpenAI wrapper.
 models = [
    dict(abbr='GPT-3.5-turbo',
        type=OpenAI, path='gpt-3.5-turbo',
        # 'ENV' tells the OpenAI wrapper to read the key from $OPENAI_API_KEY,
        # so neither a real secret nor a dummy placeholder lives in the config.
        # A literal key string may still be written here if preferred.
        key='ENV',
        max_out_len=2048, max_seq_len=2048, batch_size=1)
 ]
--- a/configs/models/opt/hf_opt_125m.py
+++ b/configs/models/opt/hf_opt_125m.py
@ -1,23 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # OPT-125M
 # Config dict for 'facebook/opt-125m'; exported below as the single entry of
 # `models`.
 opt125m = dict(
       type=HuggingFaceCausalLM,
       # the following are HuggingFaceCausalLM init parameters
       path='facebook/opt-125m',
       tokenizer_path='facebook/opt-125m',
       tokenizer_kwargs=dict(
           padding_side='left',
           truncation_side='left',
           proxies=None,
           trust_remote_code=True),
       model_kwargs=dict(device_map='auto'),
       max_seq_len=2048,
       # the following are not HuggingFaceCausalLM init parameters
       abbr='opt125m',                # Model abbreviation
       max_out_len=100,               # Maximum number of generated tokens
       batch_size=128,
       run_cfg=dict(num_gpus=1),   # Run configuration for specifying resource requirements
    )
 models = [opt125m]
--- a/configs/models/opt/hf_opt_350m.py
+++ b/configs/models/opt/hf_opt_350m.py
@ -1,23 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # OPT-350M
 # Config dict for 'facebook/opt-350m'; exported below as the single entry of
 # `models`.
 opt350m = dict(
       type=HuggingFaceCausalLM,
       # the following are HuggingFaceCausalLM init parameters
       path='facebook/opt-350m',
       tokenizer_path='facebook/opt-350m',
       tokenizer_kwargs=dict(
           padding_side='left',
           truncation_side='left',
           proxies=None,
           trust_remote_code=True),
       model_kwargs=dict(device_map='auto'),
       max_seq_len=2048,
       # the following are not HuggingFaceCausalLM init parameters
       abbr='opt350m',                    # Model abbreviation
       max_out_len=100,                   # Maximum number of generated tokens
       batch_size=64,
       run_cfg=dict(num_gpus=1),    # Run configuration for specifying resource requirements
    )
 models = [opt350m]
--- a/configs/models/qwen/hf_qwen_7b.py
+++ b/configs/models/qwen/hf_qwen_7b.py
@ -1,33 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Please note that the revision is pinned here. On 2023-08-27, during our
 # evaluations, we found that the newer model revisions score more than
 # 5 points lower on datasets such as GaokaoBench / mbpp.
 # We are not yet sure whether this drop is caused by incorrect logic in how
 # OpenCompass calls Qwen or by something else, so we want to highlight it.
 #
 # Model list with one entry: Qwen-7B loaded through HuggingFaceCausalLM.
 models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-hf',
        path="Qwen/Qwen-7B",
        tokenizer_path='Qwen/Qwen-7B',
        # Same pinned revision as in `model_kwargs` below.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
            revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
        ),
        # NOTE(review): hard-coded pad token id; presumably a special token of
        # the Qwen tokenizer — confirm it still matches the pinned revision.
        pad_token_id=151643,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
            revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
        ),
        # Resources requested for this model: 1 GPU, 1 process.
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
 ]
--- a/configs/models/vicuna/hf_vicuna_13b_v13.py
+++ b/configs/models/vicuna/hf_vicuna_13b_v13.py
@ -1,22 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: Vicuna 13B v1.3 ("lmsys/vicuna-13b-v1.3") loaded
 # through HuggingFaceCausalLM.
 models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='vicuna-13b-v1.3-hf',
        path="lmsys/vicuna-13b-v1.3",
        tokenizer_path='lmsys/vicuna-13b-v1.3',
        # Pad and truncate on the left side; the fast tokenizer is disabled.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        batch_padding=False, # if false, inference with for-loop without batch padding
        # Resources requested for this model: 2 GPUs, 1 process.
        run_cfg=dict(num_gpus=2, num_procs=1)
    )
 ]
--- a/configs/models/vicuna/hf_vicuna_13b_v15.py
+++ b/configs/models/vicuna/hf_vicuna_13b_v15.py
@ -1,22 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: Vicuna 13B v1.5 ("lmsys/vicuna-13b-v1.5") loaded
 # through HuggingFaceCausalLM.
 models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='vicuna-13b-v1.5-hf',
        path="lmsys/vicuna-13b-v1.5",
        tokenizer_path='lmsys/vicuna-13b-v1.5',
        # Pad and truncate on the left side; the fast tokenizer is disabled.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        batch_padding=False, # if false, inference with for-loop without batch padding
        # Resources requested for this model: 2 GPUs, 1 process.
        run_cfg=dict(num_gpus=2, num_procs=1)
    )
 ]
--- a/configs/models/vicuna/hf_vicuna_13b_v15_16k.py
+++ b/configs/models/vicuna/hf_vicuna_13b_v15_16k.py
@ -1,22 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: Vicuna 13B v1.5 16k ("lmsys/vicuna-13b-v1.5-16k")
 # loaded through HuggingFaceCausalLM.
 models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='vicuna-13b-v1.5-16k-hf',
        path="lmsys/vicuna-13b-v1.5-16k",
        tokenizer_path='lmsys/vicuna-13b-v1.5-16k',
        # Pad and truncate on the left side; the fast tokenizer is disabled.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
        ),
        max_out_len=100,
        # NOTE(review): the checkpoint name says 16k but the limit here is
        # 8192 — confirm this is intentional.
        max_seq_len=8192,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        batch_padding=False, # if false, inference with for-loop without batch padding
        # Resources requested for this model: 2 GPUs, 1 process.
        run_cfg=dict(num_gpus=2, num_procs=1)
    )
 ]
--- a/configs/models/vicuna/hf_vicuna_33b_v13.py
+++ b/configs/models/vicuna/hf_vicuna_33b_v13.py
@ -1,22 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: Vicuna 33B v1.3 ("lmsys/vicuna-33b-v1.3") loaded
 # through HuggingFaceCausalLM.
 models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='vicuna-33b-v1.3-hf',
        path="lmsys/vicuna-33b-v1.3",
        tokenizer_path='lmsys/vicuna-33b-v1.3',
        # Pad and truncate on the left side; the fast tokenizer is disabled.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        batch_padding=False, # if false, inference with for-loop without batch padding
        # Resources requested for this model: 4 GPUs, 1 process.
        run_cfg=dict(num_gpus=4, num_procs=1)
    )
 ]
--- a/configs/models/vicuna/hf_vicuna_7b_v13.py
+++ b/configs/models/vicuna/hf_vicuna_7b_v13.py
@ -1,22 +0,0 @@
 from opencompass.models import HuggingFaceCausalLM
 # Model list with one entry: Vicuna 7B v1.3 ("lmsys/vicuna-7b-v1.3") loaded
 # through HuggingFaceCausalLM.
 models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='vicuna-7b-v1.3-hf',
        path="lmsys/vicuna-7b-v1.3",
        tokenizer_path='lmsys/vicuna-7b-v1.3',
        # Pad and truncate on the left side; the fast tokenizer is disabled.
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        batch_padding=False, # if false, inference with for-loop without batch padding
        # Resources requested for this model: 1 GPU, 1 process.
        run_cfg=dict(num_gpus=1, num_procs=1)
    )
 ]
--- a/Show More
+++ b/Show More