Merge branch 'open-compass:main' into main

bittersweet1999 2024-12-27 14:36:48 +08:00 committed by GitHub
commit b5724fc242
41 changed files with 1862 additions and 1106 deletions


@@ -66,6 +66,8 @@ with read_base():
     from opencompass.configs.summarizers.groups.mmlu_pro import \
         mmlu_pro_summary_groups  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501

 race_datasets = [race_datasets[1]]  # Only take RACE-High
 humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2'
 bbh_datasets = [
@@ -99,61 +101,66 @@ GaokaoBench_datasets = [
 ]
 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

+summary_groups = sum(
+    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
+summary_groups.append(
+    {
+        'name': 'Mathbench',
+        'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
+    }, )
 summarizer = dict(
     dataset_abbrs=[
+        'Language',
         ['race-high', 'accuracy'],
         ['ARC-c', 'accuracy'],
         ['BoolQ', 'accuracy'],
-        ['mmlu_pro', 'naive_average'],
-        ['GPQA_diamond', 'accuracy'],
-        ['cmmlu', 'naive_average'],
-        ['mmlu', 'naive_average'],
+        ['triviaqa_wiki_1shot', 'score'],
+        ['nq_open_1shot', 'score'],
+        '',
+        'General Reasoning',
         ['drop', 'accuracy'],
         ['bbh', 'naive_average'],
+        ['GPQA_diamond', 'accuracy'],
+        ['hellaswag', 'accuracy'],
+        ['TheoremQA', 'score'],
+        ['winogrande', 'accuracy'],
+        '',
+        'Math Calculation',
+        ['gsm8k', 'accuracy'],
+        ['GaokaoBench', 'weighted_average'],
+        'GaokaoBench_2010-2022_Math_II_MCQs',
+        'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
         ['math', 'accuracy'],
+        ['Mathbench', 'naive_average'],
+        '',
+        'Knowledge',
+        ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
+        ['cmmlu', 'naive_average'],
+        ['mmlu', 'naive_average'],
+        ['mmlu_pro', 'naive_average'],
+        '',
+        'Code',
         ['openai_humaneval', 'humaneval_pass@1'],
         ['openai_humaneval_v2', 'humaneval_pass@1'],
         ['sanitized_mbpp', 'score'],
-        ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
-        ['gsm8k', 'accuracy'],
-        ['GaokaoBench', 'weighted_average'],
-        ['triviaqa_wiki_1shot', 'score'],
-        ['nq_open_1shot', 'score'],
-        ['winogrande', 'accuracy'],
-        ['hellaswag', 'accuracy'],
-        ['TheoremQA', 'score'],
+        '',
         ['dingo_en_192', 'score'],
         ['dingo_zh_170', 'score'],
-        '###### MathBench-A: Application Part ######',
-        'college',
-        'high',
-        'middle',
-        'primary',
-        'arithmetic',
-        'mathbench-a (average)',
-        '###### MathBench-T: Theory Part ######',
-        'college_knowledge',
-        'high_knowledge',
-        'middle_knowledge',
-        'primary_knowledge',
-        'mathbench-t (average)',
-        '###### Overall: Average between MathBench-A and MathBench-T ######',
-        'Overall',
-        '',
-        'bbh-logical_deduction_seven_objects',
-        'bbh-multistep_arithmetic_two',
         '',
         'mmlu',
         'mmlu-stem',
         'mmlu-social-science',
         'mmlu-humanities',
         ['mmlu-other', 'accuracy'],
+        '',
         'cmmlu',
         'cmmlu-stem',
         'cmmlu-social-science',
         'cmmlu-humanities',
         'cmmlu-other',
         ['cmmlu-china-specific', 'accuracy'],
+        '',
         'mmlu_pro',
         'mmlu_pro_biology',
         'mmlu_pro_business',
@@ -169,9 +176,24 @@ summarizer = dict(
         'mmlu_pro_physics',
         'mmlu_pro_psychology',
         'mmlu_pro_other',
+        '',
+        'bbh-logical_deduction_seven_objects',
+        'bbh-multistep_arithmetic_two',
+        '###### MathBench-A: Application Part ######',
+        'college',
+        'high',
+        'middle',
+        'primary',
+        'arithmetic',
+        'mathbench-a (average)',
+        '###### MathBench-T: Theory Part ######',
+        'college_knowledge',
+        'high_knowledge',
+        'middle_knowledge',
+        'primary_knowledge',
+        'mathbench-t (average)',
     ],
-    summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
+    summary_groups=summary_groups,
 )

 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
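Note on the aggregation idiom used throughout these configs: `sum((... for k, v in locals().items() if k.endswith(suffix)), [])` concatenates every list in the module namespace whose name matches a suffix, which is how `datasets`, `summary_groups`, and `models` are assembled from the `read_base()` imports. A minimal standalone sketch of the idiom (the two input group lists are illustrative, not taken from the config):

# Illustrative inputs; real configs get these from `with read_base():` imports.
mmlu_summary_groups = [{'name': 'mmlu', 'subsets': ['mmlu-stem', 'mmlu-other']}]
bbh_summary_groups = [{'name': 'bbh', 'subsets': ['bbh-a', 'bbh-b']}]

# Concatenate every *_summary_groups list found in the module namespace.
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])

# Extra groups can then be appended by hand, as with 'Mathbench' above.
summary_groups.append({
    'name': 'Mathbench',
    'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
})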


@@ -13,12 +13,22 @@ with read_base():
     # read hf models - chat models
     from opencompass.configs.models.chatglm.hf_glm4_9b import \
         models as hf_glm4_9b_model  # noqa: F401, E501
+    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
+        models as lmdeploy_glm4_9b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
+        models as hf_deepseek_7b_base_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \
+        models as hf_deepseek_67b_base_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
         models as hf_deepseek_moe_16b_base_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
         models as hf_deepseek_v2_lite_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
         models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
+        models as lmdeploy_deepseek_67b_base_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2 import \
+        lmdeploy_deepseek_v2_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
         models as vllm_deepseek_moe_16b_base_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_2b import \
@@ -29,6 +39,8 @@ with read_base():
         models as hf_gemma_2b_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma_7b import \
         models as hf_gemma_7b_model  # noqa: F401, E501
+    from opencompass.configs.models.gemma.lmdeploy_gemma_9b import \
+        models as lmdeploy_gemma_9b_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.vllm_gemma_2b import \
         models as vllm_gemma_2b_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.vllm_gemma_7b import \
@@ -59,10 +71,14 @@ with read_base():
         models as hf_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_8b import \
         models as hf_llama3_8b_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.hf_llama3_70b import \
+        models as hf_llama3_70b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
         models as lmdeploy_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
         models as lmdeploy_llama3_8b_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import \
+        models as lmdeploy_llama3_70b_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
         models as hf_mistral_7b_v0_2_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
@@ -73,10 +89,16 @@ with read_base():
         models as hf_qwen_2_5_7b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \
         models as hf_qwen_2_5_14b_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.hf_qwen_2_5_32b import \
+        models as hf_qwen_2_5_32b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
         models as lmdeploy_qwen2_5_1_5b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \
         models as lmdeploy_qwen2_5_7b_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b import \
+        models as lmdeploy_qwen2_5_32b_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b import \
+        models as lmdeploy_qwen2_5_72b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
         models as hf_qwen1_5_moe_a2_7b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
@@ -95,6 +117,10 @@ with read_base():
         models as hf_yi_1_5_6b_model  # noqa: F401, E501
     from opencompass.configs.models.yi.hf_yi_1_5_9b import \
         models as hf_yi_1_5_9b_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b import \
+        models as lmdeploy_yi_1_5_9b_model  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501

 race_datasets = [race_datasets[1]]
 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
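For readers unfamiliar with the import style in these files: `with read_base():` is mmengine's config-inheritance mechanism; imports made under it are intercepted and the referenced config module's variables are merged into this file's namespace rather than executed as ordinary Python imports. A hedged sketch of the pattern, reusing one import from this diff (it only runs inside an OpenCompass config, not as a standalone script):

from mmengine.config import read_base

with read_base():
    # Under read_base(), the `models` list defined in the referenced config
    # becomes available here under the alias `hf_glm4_9b_model`.
    from opencompass.configs.models.chatglm.hf_glm4_9b import \
        models as hf_glm4_9b_model  # noqa: F401

# Every alias ending in `_model` can then be swept up generically:
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])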


@@ -7,8 +7,6 @@ with read_base():
     from opencompass.configs.datasets.race.race_gen import \
         race_datasets  # noqa: F401, E501
     # read hf models - chat models
-    from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \
-        models as hf_baichuan2_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
         models as hf_glm4_9b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
@@ -17,22 +15,30 @@ with read_base():
         models as vllm_glm4_9b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
         models as hf_deepseek_7b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \
+        models as hf_deepseek_67b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
         models as hf_deepseek_moe_16b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
         models as hf_deepseek_v2_lite_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
+        models as lmdeploy_deepseek_v2_5_1210_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
         models as vllm_deepseek_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
         models as hf_gemma2_2b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
         models as hf_gemma2_9b_it_model  # noqa: F401, E501
+    from opencompass.configs.models.gemma.hf_gemma2_27b_it import \
+        models as hf_gemma2_27b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma_2b_it import \
         models as hf_gemma_2b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma_7b_it import \
         models as hf_gemma_7b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
         models as lmdeploy_gemma_9b_it_model  # noqa: F401, E501
+    from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \
+        models as lmdeploy_gemma_27b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
         models as vllm_gemma_7b_it_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
@@ -65,6 +71,8 @@ with read_base():
         models as lmdeploy_llama3_1_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
         models as lmdeploy_llama3_2_3b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_3_70b_instruct import \
+        models as lmdeploy_llama3_3_70b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
         models as lmdeploy_llama3_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
@@ -75,6 +83,13 @@ with read_base():
         models as hf_mistral_nemo_instruct_2407_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
         models as hf_mistral_small_instruct_2409_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
+        models as hf_mixtral_8x7b_instruct_v0_1_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
+        models as hf_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
+        models as \
+        lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \
         models as lmdeploy_mistral_nemo_instruct_2407_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
@@ -84,22 +99,28 @@ with read_base():
         models as vllm_mistral_7b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
         models as vllm_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
+    from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
+        models as lmdeploy_nemotron_70b_instruct_hf_model  # noqa: F401, E501
     from opencompass.configs.models.openbmb.hf_minicpm3_4b import \
         models as hf_minicpm3_4b_model  # noqa: F401, E501
-    from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
-        models as hf_minicpm_2b_dpo_fp32_model  # noqa: F401, E501
-    from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
-        models as hf_minicpm_2b_sft_bf16_model  # noqa: F401, E501
-    from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
-        models as hf_minicpm_2b_sft_fp32_model  # noqa: F401, E501
     from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
         models as hf_phi_3_mini_4k_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
         models as hf_phi_3_mini_8k_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
+        models as hf_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \
+        models as hf_qwen2_5_3b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \
         models as hf_qwen2_5_14b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import \
+        models as lmdeploy_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import \
+        models as lmdeploy_qwen2_5_3b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
         models as lmdeploy_qwen2_5_14b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
+        models as lmdeploy_qwen2_5_72b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
         models as hf_qwen1_5_0_5b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
@@ -116,6 +137,14 @@ with read_base():
         models as hf_yi_1_5_6b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
         models as hf_yi_1_5_9b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_6b_chat import \
+        models as lmdeploy_yi_1_5_6b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \
+        models as lmdeploy_yi_1_5_9b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_34b_chat import \
+        models as lmdeploy_yi_1_5_34b_chat_model  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501

 race_datasets = [race_datasets[1]]
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])


@@ -7,8 +7,14 @@ with read_base():
         aime2024_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
         ARC_c_datasets  # noqa: F401, E501
+    # remove because of oom
+    # from opencompass.configs.datasets.ARC_Prize_Public_Evaluation.arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
         bbh_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_complete_gen_faf748 import \
+        bigcodebench_hard_complete_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_8815eb import \
+        bigcodebench_hard_instruct_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
         cmmlu_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
@@ -26,15 +32,17 @@ with read_base():
         gsm8k_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
         hellaswag_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \
+    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
         humaneval_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import \
+    from opencompass.configs.datasets.humanevalx.humanevalx_gen_3d84a3 import \
         humanevalx_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
+    from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
         ifeval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import \
+        korbench_0shot_single_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
         LCB_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
+    from opencompass.configs.datasets.math.math_0shot_gen_11c4b5 import \
         math_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
         mathbench_datasets  # noqa: F401, E501
@@ -71,6 +79,7 @@ with read_base():
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
         models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
     # Summary Groups
+    # Summary Groups
     from opencompass.configs.summarizers.groups.bbh import \
         bbh_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.cmmlu import \
@@ -81,6 +90,8 @@ with read_base():
         GaokaoBench_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.humanevalx import \
         humanevalx_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.korbench import \
+        korbench_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
         mathbench_2024_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.mmlu import \
@@ -96,6 +107,8 @@ with read_base():
     from opencompass.configs.summarizers.mmmlu_lite import \
         mmmlu_summary_groups  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501

 # For HumanEval-X Evaluation
 # Apply the evaluator ip_address and port
 race_datasets = [race_datasets[1]]
@@ -185,6 +198,8 @@ summarizer = dict(
         ['hellaswag', 'accuracy'],
         ['TheoremQA', 'score'],
         ['musr_average', 'naive_average'],
+        ['korbench_single', 'naive_average'],
+        ['ARC_Prize_Public_Evaluation', 'accuracy'],
         '',
         'Math Calculation',
         ['gsm8k', 'accuracy'],
@@ -208,6 +223,8 @@ summarizer = dict(
         ['lcb_code_generation', 'pass@1'],
         ['lcb_code_execution', 'pass@1'],
         ['lcb_test_output', 'pass@1'],
+        ['bigcodebench_hard_instruct', 'pass@1'],
+        ['bigcodebench_hard_complete', 'pass@1'],
         '',
         'Agent',
         ['teval', 'naive_average'],


@@ -0,0 +1,182 @@
+from copy import deepcopy
+
+from mmengine.config import read_base
+
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.summarizers import DefaultSubjectiveSummarizer
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+
+with read_base():
+    # read hf models - chat models
+    # Dataset
+    from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import \
+        csimpleqa_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.SimpleQA.simpleqa_gen_0283c3 import \
+        simpleqa_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm_new import \
+        alignbench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_new import \
+        alpacav2_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_new import \
+        arenahard_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \
+        compassarena_datasets  # noqa: F401, E501
+    # from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import fofo_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \
+        followbench_llmeval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \
+        mtbench101_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_new import \
+        wildbench_datasets  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
+        models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
+        models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
+                and 'mtbench101' not in k and 'wildbench' not in k), [])
+datasets += mtbench101_datasets  # noqa: F401, E501
+datasets += wildbench_datasets  # noqa: F401, E501
+
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ],
+    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
+)
+
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+
+for m in models:
+    m['abbr'] = m['abbr'] + '_fullbench'
+    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
+        m['engine_config']['max_batch_size'] = 1
+        m['batch_size'] = 1
+
+models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
+
+judge_models = deepcopy([models[1]])
+judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'
+
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveNaivePartitioner,
+        models=models,
+        judge_models=judge_models,
+    ),
+    runner=dict(type=LocalRunner,
+                max_num_workers=16,
+                task=dict(type=SubjectiveEvalTask)),
+)
+
+summary_groups = []
+summary_groups.append({
+    'name': 'compassarena_language',
+    'subsets': [
+        ['compassarena_language', '内容总结'],
+    ],
+})
+summary_groups.append({
+    'name': 'compassarena_knowledge',
+    'subsets': [
+        ['compassarena_knowledge', '生活常识_ZH'],
+    ],
+})
+summary_groups.append({
+    'name': 'compassarena_reason_v2',
+    'subsets': [
+        ['compassarena_reason_v2', 'reasoning'],
+    ],
+})
+summary_groups.append({
+    'name': 'compassarena_math_v2',
+    'subsets': [
+        ['compassarena_math_v2', '高等数学_ZH'],
+    ],
+})
+summary_groups.append({
+    'name': 'compassarena_creationv2_zh',
+    'subsets': [
+        ['compassarena_creationv2_zh', '内容扩写_ZH'],
+    ],
+})
+summary_groups.append({
+    'name':
+    'CompassArena',
+    'subsets': [
+        'compassarena_language',
+        'compassarena_knowledge',
+        'compassarena_reason_v2',
+        'compassarena_math_v2',
+        'compassarena_creationv2_zh',
+    ],
+})
+summary_groups.append({
+    'name':
+    'FoFo',
+    'subsets': [['fofo_test_prompts', 'overall'],
+                ['fofo_test_prompts_cn', 'overall']],
+})
+summary_groups.append({
+    'name':
+    'Followbench',
+    'subsets': [
+        ['followbench_llmeval_en', 'HSR_AVG'],
+        ['followbench_llmeval_en', 'SSR_AVG'],
+    ],
+})
+
+# Summarizer
+summarizer = dict(
+    dataset_abbrs=[
+        ['alignment_bench_v1_1', '总分'],
+        ['alpaca_eval', 'total'],
+        ['arenahard', 'score'],
+        ['Followbench', 'naive_average'],
+        ['CompassArena', 'naive_average'],
+        ['FoFo', 'naive_average'],
+        ['mtbench101', 'avg'],
+        ['wildbench', 'average'],
+        ['simpleqa', 'accuracy_given_attempted'],
+        ['chinese_simpleqa', 'given_attempted_accuracy'],
+        '',
+        ['alignment_bench_v1_1', '专业能力'],
+        ['alignment_bench_v1_1', '数学计算'],
+        ['alignment_bench_v1_1', '基本任务'],
+        ['alignment_bench_v1_1', '逻辑推理'],
+        ['alignment_bench_v1_1', '中文理解'],
+        ['alignment_bench_v1_1', '文本写作'],
+        ['alignment_bench_v1_1', '角色扮演'],
+        ['alignment_bench_v1_1', '综合问答'],
+        ['alpaca_eval', 'helpful_base'],
+        ['alpaca_eval', 'koala'],
+        ['alpaca_eval', 'oasst'],
+        ['alpaca_eval', 'selfinstruct'],
+        ['alpaca_eval', 'vicuna'],
+        ['compassarena_language', 'naive_average'],
+        ['compassarena_knowledge', 'naive_average'],
+        ['compassarena_reason_v2', 'naive_average'],
+        ['compassarena_math_v2', 'naive_average'],
+        ['compassarena_creationv2_zh', 'naive_average'],
+        ['fofo_test_prompts', 'overall'],
+        ['fofo_test_prompts_cn', 'overall'],
+        ['followbench_llmeval_en', 'HSR_AVG'],
+        ['followbench_llmeval_en', 'SSR_AVG'],
+        ['followbench_llmeval_en', 'HSR_L1'],
+        ['followbench_llmeval_en', 'HSR_L2'],
+        ['followbench_llmeval_en', 'HSR_L3'],
+        ['followbench_llmeval_en', 'HSR_L4'],
+        ['followbench_llmeval_en', 'HSR_L5'],
+        ['followbench_llmeval_en', 'SSR_L1'],
+        ['followbench_llmeval_en', 'SSR_L2'],
+        ['followbench_llmeval_en', 'SSR_L3'],
+        ['followbench_llmeval_en', 'SSR_L4'],
+        ['followbench_llmeval_en', 'SSR_L5'],
+        ['simpleqa', 'f1'],
+    ],
+    type=DefaultSubjectiveSummarizer,
+    summary_groups=summary_groups,
+)
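A note on the `summary_groups` added in this file: each entry names an aggregate column that the summarizer derives from the listed `subsets`; for a multi-subset group such as `CompassArena`, the `naive_average` referenced in `dataset_abbrs` is, to my understanding, an unweighted mean over the subsets. A rough sketch of that roll-up (scores are placeholders, not baseline values; the real summarizer also handles missing subsets, weights, and nested groups):

# Placeholder per-subset scores, keyed the way the summarizer sees them.
scores = {
    'compassarena_language': 60.0,
    'compassarena_knowledge': 56.0,
    'compassarena_reason_v2': 50.0,
    'compassarena_math_v2': 53.5,
    'compassarena_creationv2_zh': 48.75,
}
group = {'name': 'CompassArena', 'subsets': list(scores)}

# Unweighted mean over the group's subsets -> one aggregate column.
scores[group['name']] = sum(scores[s] for s in group['subsets']) / len(group['subsets'])
print(scores['CompassArena'])  # 53.65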


@@ -1,70 +0,0 @@
-from copy import deepcopy
-
-from mmengine.config import read_base
-
-from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
-from opencompass.runners import LocalRunner
-from opencompass.summarizers import SubjectiveSummarizer
-from opencompass.tasks.subjective_eval import SubjectiveEvalTask
-
-with read_base():
-    # read hf models - chat models
-    # Dataset
-    from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
-        alignbench_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \
-        alpacav2_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
-        arenahard_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \
-        compassarena_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \
-        fofo_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \
-        followbench_llmeval_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \
-        mtbench101_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \
-        wildbench_datasets  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
-        models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
-        models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
-
-summarizer = dict(type=SubjectiveSummarizer, function='subjective')
-
-datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
-                and 'mtbench101' not in k and 'wildbench' not in k), [])
-datasets += mtbench101_datasets  # noqa: F401, E501
-datasets += wildbench_datasets  # noqa: F401, E501
-
-api_meta_template = dict(
-    round=[
-        dict(role='HUMAN', api_role='HUMAN'),
-        dict(role='BOT', api_role='BOT', generate=True),
-    ],
-    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
-)
-
-models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
-
-for m in models:
-    m['abbr'] = m['abbr'] + '_fullbench'
-    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
-        m['engine_config']['max_batch_size'] = 1
-        m['batch_size'] = 1
-
-models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
-
-judge_models = deepcopy([models[1]])
-judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'
-
-eval = dict(
-    partitioner=dict(
-        type=SubjectiveNaivePartitioner,
-        models=models,
-        judge_models=judge_models,
-    ),
-    runner=dict(type=LocalRunner,
-                max_num_workers=16,
-                task=dict(type=SubjectiveEvalTask)),
-)


@@ -6,47 +6,19 @@ import yaml

 output_path = 'regression_result_daily'

-chat_model_list = [
-    'baichuan2-7b-chat-hf', 'glm-4-9b-chat-hf', 'glm-4-9b-chat-turbomind',
-    'glm-4-9b-chat-vllm', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
-    'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', 'gemma2-9b-it-hf',
-    'gemma-2b-it-hf', 'gemma-7b-it-hf', 'gemma-2-9b-it-turbomind',
-    'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
-    'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
-    'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-lmdeploy',
-    'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm',
-    'llama-3_1-8b-instruct-hf', 'llama-3_2-3b-instruct-hf',
-    'llama-3-8b-instruct-hf', 'llama-3_1-8b-instruct-turbomind',
-    'llama-3_2-3b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
-    'mistral-7b-instruct-v0.2-hf', 'mistral-7b-instruct-v0.3-hf',
-    'mistral-nemo-instruct-2407-hf', 'mistral-nemo-instruct-2407-turbomind',
-    'mistral-7b-instruct-v0.1-vllm', 'mistral-7b-instruct-v0.2-vllm',
-    'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
-    'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
-    'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf',
-    'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
-    'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf',
-    'deepseek-v2-lite-chat-hf', 'internlm2_5-20b-chat-hf',
-    'internlm2_5-20b-chat-turbomind', 'mistral-small-instruct-2409-hf',
-    'mistral-small-instruct-2409-turbomind', 'qwen2.5-14b-instruct-hf',
-    'qwen2.5-14b-instruct-turbomind'
-]
-base_model_list = [
-    'glm-4-9b-hf', 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
-    'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', 'gemma2-9b-hf',
-    'gemma-2b-hf', 'gemma-7b-hf', 'gemma-2b-vllm', 'gemma-7b-vllm',
-    'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
-    'internlm2-1.8b-turbomind', 'internlm2_5-7b-turbomind',
-    'internlm2-7b-turbomind', 'internlm2-base-7b-turbomind', 'llama-2-7b-hf',
-    'llama-3_1-8b-hf', 'llama-3-8b-hf', 'llama-3.1-8b-turbomind',
-    'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'mistral-7b-v0.3-hf',
-    'mistral-7b-v0.2-vllm', 'qwen2.5-7b-hf', 'qwen2.5-1.5b-turbomind',
-    'qwen2.5-7b-turbomind', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
-    'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind',
-    'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf',
-    'deepseek-v2-lite-hf', 'internlm2-20b-hf', 'internlm2-base-20b-hf',
-    'internlm2-20b-turbomind', 'qwen2.5-14b-hf'
-]
+
+def model_list(type):
+    config_path = '.github/scripts/oc_score_baseline_testrange.yaml'
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config.get(type).keys()
+
+
+def dataset_list(model, type):
+    config_path = '.github/scripts/oc_score_baseline_fullbench.yaml'
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config.get(model).get(type).keys()


 @pytest.fixture()
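The two helpers above replace the hard-coded `chat_model_list`/`base_model_list` with keys read from the baseline YAML files, so the parametrized tests below automatically track whatever models and datasets the baselines define. A sketch of the layout `model_list('chat')` appears to expect, with the YAML inlined and the model entries and scores invented for illustration:

import yaml

# Assumed shape of .github/scripts/oc_score_baseline_testrange.yaml, inferred
# from how model_list() and the tests index it; entries here are made up.
baseline_testrange = yaml.safe_load("""
chat:
  glm-4-9b-chat-hf:
    gsm8k_accuracy: 70.0
    race-high_accuracy: 88.0
base:
  glm-4-9b-hf:
    gsm8k_accuracy: 65.0
""")

def model_list(type):
    return baseline_testrange.get(type).keys()

print(list(model_list('chat')))  # ['glm-4-9b-chat-hf']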
@@ -88,35 +60,39 @@ def result_scores():

 @pytest.mark.usefixtures('result_scores')
 @pytest.mark.usefixtures('baseline_scores_testrange')
-@pytest.mark.chat
+@pytest.mark.chat_models
 class TestChat:
     """Test cases for chat model."""

-    @pytest.mark.parametrize('model, dataset',
-                             [(p1, p2) for p1 in chat_model_list
-                              for p2 in ['gsm8k', 'race-high']])
+    @pytest.mark.parametrize(
+        'model, dataset', [(p1, p2) for p1 in model_list('chat')
+                           for p2 in ['gsm8k_accuracy', 'race-high_accuracy']])
     def test_model_dataset_score(self, baseline_scores_testrange,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_testrange.get(model).get(dataset)
+        base_score = baseline_scores_testrange.get('chat').get(model).get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)


 @pytest.mark.usefixtures('result_scores')
 @pytest.mark.usefixtures('baseline_scores_testrange')
-@pytest.mark.base
+@pytest.mark.base_models
 class TestBase:
     """Test cases for base model."""

-    @pytest.mark.parametrize(
-        'model, dataset',
-        [(p1, p2) for p1 in base_model_list
-         for p2 in ['gsm8k', 'GPQA_diamond', 'race-high', 'winogrande']])
+    @pytest.mark.parametrize('model, dataset',
+                             [(p1, p2) for p1 in model_list('base') for p2 in [
+                                 'gsm8k_accuracy', 'GPQA_diamond_accuracy',
+                                 'race-high_accuracy', 'winogrande_accuracy'
+                             ]])
     def test_model_dataset_score(self, baseline_scores_testrange,
                                  result_scores, model, dataset):
-        if model in ['gemma-2b-vllm', 'gemma-7b-vllm'] and dataset != 'gsm8k':
+        if model in ['gemma-2b-vllm', 'gemma-7b-vllm'
+                     ] and dataset != 'gsm8k_accuracy':
             return
-        base_score = baseline_scores_testrange.get(model).get(dataset)
+        base_score = baseline_scores_testrange.get('base').get(model).get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
@@ -130,21 +106,11 @@ class TestChatObjFullbench:

     @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
         'internlm2_5-7b-chat-hf_fullbench',
         'internlm2_5-7b-chat-turbomind_fullbench'
-    ] for p2 in [
-        'race-high', 'ARC-c', 'BoolQ', 'triviaqa_wiki_1shot', 'nq_open_1shot',
-        'IFEval', 'drop', 'GPQA_diamond', 'hellaswag', 'TheoremQA',
-        'musr_average', 'gsm8k', 'math', 'cmo_fib', 'aime2024',
-        'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000',
-        'lcb_code_generation', 'lcb_code_execution', 'lcb_test_output',
-        'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
-        'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas',
-        'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn',
-        'ds1000_Pytorch', 'ds1000_Matplotlib', 'openai_mmmlu_lite_AR-XY',
-        'college', 'college_knowledge'
-    ]])
+    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'objective')])
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
@@ -158,22 +124,12 @@ class TestChatSubFullbench:

     @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
         'internlm2_5-7b-chat-hf_fullbench',
         'internlm2_5-7b-chat-turbomind_fullbench'
-    ] for p2 in [
-        'Alignbench总分', 'Alignbench专业能力', 'AlpacaEvaltotal',
-        'AlpacaEvalhelpful_base', 'CompassArenacompassarena_language',
-        'CompassArenacompassarena_knowledge',
-        'CompassArenacompassarena_reason_v2',
-        'CompassArenacompassarena_math_v2',
-        'CompassArenacompassarena_creationv2_zh', 'Fofofofo_test_prompts',
-        'followbenchHSR_AVG', 'followbenchSSR_AVG', 'followbenchHSR_L1',
-        'followbenchHSR_L2', 'followbenchHSR_L3', 'followbenchHSR_L4',
-        'followbenchHSR_L5', 'followbenchSSR_L1', 'followbenchSSR_L2',
-        'followbenchSSR_L3', 'followbenchSSR_L4', 'followbenchSSR_L5',
-        'MTBench101average', 'Wildbenchscore'
-    ]])
+    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'subjective')]
+                             )
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get(
+            'subjective').get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
@@ -184,20 +140,15 @@ class TestChatSubFullbench:
 class TestBaseFullbench:
     """Test cases for chat model."""

-    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
-        'internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench'
-    ] for p2 in [
-        'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
-        'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k',
-        'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag',
-        'TheoremQA', 'dingo_en_192', 'dingo_zh_170', 'college',
-        'college_knowledge', 'bbh-logical_deduction_seven_objects',
-        'bbh-multistep_arithmetic_two', 'mmlu-other', 'cmmlu-china-specific',
-        'mmlu_pro_math'
-    ]])
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in
+         ['internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench']
+         for p2 in dataset_list('internlm2_5-7b-hf_fullbench', 'objective')])
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
@@ -209,40 +160,109 @@ class TestApibench:
     """Test cases for chat model."""

     @pytest.mark.parametrize('model, dataset',
-                             [('lmdeploy-api-test', 'race-middle'),
-                              ('lmdeploy-api-test', 'race-high'),
-                              ('lmdeploy-api-test', 'gsm8k')])
+                             [('lmdeploy-api-test', 'race-middle_accuracy'),
+                              ('lmdeploy-api-test', 'race-high_accuracy'),
+                              ('lmdeploy-api-test', 'gsm8k_accuracy')])
     def test_api(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model + '_batch', result_score, base_score)


+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores_fullbench')
+@pytest.mark.volc_fullbench
+class TestVolcFullbench:
+    """Test cases for chat model."""
+
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
+         for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
+    @pytest.mark.chat_objective
+    def test_chat_objective(self, baseline_scores_fullbench, result_scores,
+                            model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score)
+
+    @pytest.mark.parametrize('model, dataset', [
+        (p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
+        for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'subjective')
+    ])
+    @pytest.mark.chat_subjective
+    def test_chat_subjective(self, baseline_scores_fullbench, result_scores,
+                             model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(
+            'subjective').get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score)
+
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
+         for p2 in dataset_list('internlm2_5-7b-turbomind', 'objective')])
+    @pytest.mark.base_objective
+    def test_base_objective(self, baseline_scores_fullbench, result_scores,
+                            model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score)
+
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
+         for p2 in dataset_list('internlm2_5-7b-turbomind', 'long_context')])
+    @pytest.mark.base_long_context
+    def test_base_long_context(self, baseline_scores_fullbench, result_scores,
+                               model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(
+            'long_context').get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score)
+
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2)
+         for p1 in ['internlm2_5-7b-chat-1m-turbomind'] for p2 in dataset_list(
+             'internlm2_5-7b-chat-1m-turbomind', 'long_context')])
+    @pytest.mark.chat_long_context
+    def test_chat_long_context(self, baseline_scores_fullbench, result_scores,
+                               model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(
+            'long_context').get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score)
+
+
 @pytest.mark.usefixtures('result_scores')
 @pytest.mark.usefixtures('baseline_scores')
 class TestCmdCase:

     @pytest.mark.case1
     @pytest.mark.parametrize('model, dataset',
-                             [('internlm2_5-7b-hf', 'race-middle'),
-                              ('internlm2_5-7b-hf', 'race-high'),
-                              ('internlm2_5-7b-hf', 'demo_gsm8k'),
-                              ('internlm2-1.8b-hf', 'race-middle'),
-                              ('internlm2-1.8b-hf', 'race-high'),
-                              ('internlm2-1.8b-hf', 'demo_gsm8k')])
+                             [('internlm2_5-7b-hf', 'race-middle_accuracy'),
+                              ('internlm2_5-7b-hf', 'race-high_accuracy'),
+                              ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'),
+                              ('internlm2-1.8b-hf', 'race-middle_accuracy'),
+                              ('internlm2-1.8b-hf', 'race-high_accuracy'),
+                              ('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
     def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)

     @pytest.mark.case2
-    @pytest.mark.parametrize('model, dataset',
-                             [('internlm2_5-7b-chat-lmdeploy', 'race-middle'),
-                              ('internlm2_5-7b-chat-lmdeploy', 'race-high'),
-                              ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k'),
-                              ('internlm2-chat-1.8b-lmdeploy', 'race-middle'),
-                              ('internlm2-chat-1.8b-lmdeploy', 'race-high'),
-                              ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k')])
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
+         ('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
+         ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
+         ('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'),
+         ('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'),
+         ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')])
     def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
@@ -250,19 +270,19 @@ class TestCmdCase:

     @pytest.mark.case3
     @pytest.mark.parametrize('model, dataset',
-                             [('internlm2_5-7b_hf', 'race-middle'),
-                              ('internlm2_5-7b_hf', 'race-high'),
-                              ('internlm2_5-7b_hf', 'demo_gsm8k')])
+                             [('internlm2_5-7b_hf', 'race-middle_accuracy'),
+                              ('internlm2_5-7b_hf', 'race-high_accuracy'),
+                              ('internlm2_5-7b_hf', 'demo_gsm8k_accuracy')])
     def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)

     @pytest.mark.case4
-    @pytest.mark.parametrize('model, dataset',
-                             [('internlm2_5-7b-chat_hf', 'race-middle'),
-                              ('internlm2_5-7b-chat_hf', 'race-high'),
-                              ('internlm2_5-7b-chat_hf', 'demo_gsm8k')])
+    @pytest.mark.parametrize(
+        'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'),
+                           ('internlm2_5-7b-chat_hf', 'race-high_accuracy'),
+                           ('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')])
     def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
@@ -310,8 +330,7 @@ def find_csv_files(directory):
     csv_files = []
     for root, dirs, files in os.walk(directory):
         for file in files:
-            if file.endswith('.csv') and (file.startswith('summary') or
-                                          file.startswith('Subjective_all')):
+            if file.endswith('.csv') and file.startswith('summary'):
                 csv_files.append(os.path.join(root, file))

     csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
@@ -324,24 +343,15 @@ def read_csv_file(file_path):
     with open(file_path, 'r') as csvfile:
         reader = csv.DictReader(csvfile)
         filtered_data = []
-        if 'Subjective_all' not in file_path:
-            for row in reader:
-                if row['metric'] is not None and 'bpb' not in row['metric']:
-                    filtered_row = {
-                        k: v
-                        for k, v in row.items()
-                        if k not in ['version', 'metric', 'mode']
-                    }
-                    filtered_data.append(filtered_row)
-        else:
-            for row in reader:
-                if row['Detailed Scores'] is not None:
-                    filtered_row = row
-                    filtered_row['dataset'] = filtered_row[
-                        'Dataset'] + filtered_row['Detailed Scores']
-                    del filtered_row['Dataset']
-                    del filtered_row['Detailed Scores']
-                    filtered_data.append(filtered_row)
+        for row in reader:
+            if row['metric'] is not None and 'bpb' not in row[
+                    'metric'] and '_' != row['metric']:
+                filtered_row = row
+                filtered_row['dataset'] = row['dataset'] + '_' + row['metric']
+                del filtered_row['version']
+                del filtered_row['metric']
+                del filtered_row['mode']
+                filtered_data.append(filtered_row)

     result = {}
     for data in filtered_data:
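The reworked `read_csv_file` folds the metric name into the dataset key, which is why every key in the baseline YAML files below gains a metric suffix (`race-high_accuracy`, `demo_gsm8k_accuracy`, ...). A small worked example of the transformation on a made-up summary row (the CSV content and version hash are invented for illustration):

import csv
import io

# Made-up summary CSV in the shape the function consumes.
csv_text = ('dataset,version,metric,mode,internlm2_5-7b-hf\n'
            'race-high,abc123,accuracy,gen,90.02\n')

reader = csv.DictReader(io.StringIO(csv_text))
for row in reader:
    if row['metric'] is not None and 'bpb' not in row['metric'] \
            and '_' != row['metric']:
        # Fold the metric into the dataset key, then drop bookkeeping columns.
        row['dataset'] = row['dataset'] + '_' + row['metric']
        del row['version'], row['metric'], row['mode']
        print(row)
# -> {'dataset': 'race-high_accuracy', 'internlm2_5-7b-hf': '90.02'}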


@ -1,34 +1,34 @@
internlm2_5-7b-hf: internlm2_5-7b-hf:
demo_gsm8k: 42.19 demo_gsm8k_accuracy: 42.19
race-middle: 91.78 race-middle_accuracy: 91.78
race-high: 90.02 race-high_accuracy: 90.02
internlm2_5-7b_hf: internlm2_5-7b_hf:
demo_gsm8k: 42.19 demo_gsm8k_accuracy: 42.19
race-middle: 91.78 race-middle_accuracy: 91.78
race-high: 90.02 race-high_accuracy: 90.02
internlm2-1.8b-hf: internlm2-1.8b-hf:
demo_gsm8k: 15.62 demo_gsm8k_accuracy: 15.62
race-middle: 71.66 race-middle_accuracy: 71.66
race-high: 66.38 race-high_accuracy: 66.38
internlm2_5-7b-chat-lmdeploy: internlm2_5-7b-chat-lmdeploy:
demo_gsm8k: 84.38 demo_gsm8k_accuracy: 89.06
race-middle: 92.76 race-middle_accuracy: 92.76
race-high: 90.54 race-high_accuracy: 90.54
internlm2-chat-1.8b-lmdeploy: internlm2-chat-1.8b-lmdeploy:
demo_gsm8k: 31 demo_gsm8k_accuracy: 32
race-middle: 81.34 race-middle_accuracy: 81.34
race-high: 73.96 race-high_accuracy: 73.96
internlm2_5-7b-chat_hf: internlm2_5-7b-chat_hf:
demo_gsm8k: 87.50 demo_gsm8k_accuracy: 87.50
race-middle: 92.76 race-middle_accuracy: 92.76
race-high: 90.48 race-high_accuracy: 90.48
lmdeploy-api-test: lmdeploy-api-test:
gsm8k: 83.78 gsm8k_accuracy: 83.78
race-middle: 92.41 race-middle_accuracy: 92.41
race-high: 90.37 race-high_accuracy: 90.37
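
Every key in this baseline file is renamed from the bare dataset name to dataset_metric so that it matches the keys read_csv_file now produces. A hedged sketch of how the baseline_scores fixture used by the tests above might load and query it (the file path is an assumption):

import yaml

with open('.github/scripts/oc_score_baseline.yaml') as f:  # path assumed
    baseline_scores = yaml.safe_load(f)

# Lookups must use the suffixed keys after this change:
assert baseline_scores['internlm2_5-7b-chat_hf']['race-high_accuracy'] == 90.48
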

View File

@ -1,173 +1,456 @@
internlm2_5-7b-chat-hf_fullbench: internlm2_5-7b-chat-hf_fullbench:
race-high: 93.75 objective:
ARC-c: 93.75 race-high_accuracy: 93.75
BoolQ: 81.25 ARC-c_accuracy: 93.75
triviaqa_wiki_1shot: 50 BoolQ_accuracy: 81.25
nq_open_1shot: 25 triviaqa_wiki_1shot_score: 50
IFEval: 50 nq_open_1shot_score: 25
drop: 81.25 IFEval_Prompt-level-strict-accuracy: 50
GPQA_diamond: 25 drop_accuracy: 81.25
hellaswag: 87.5 GPQA_diamond_accuracy: 25
TheoremQA: 18.75 hellaswag_accuracy: 87.5
musr_average: 39.58 TheoremQA_score: 18.75
gsm8k: 56.25 musr_average_naive_average: 39.58
math: 75 korbench_single_naive_average: 40
cmo_fib: 6.25 gsm8k_accuracy: 62.50
aime2024: 6.25 math_accuracy: 75
wikibench-wiki-single_choice_cncircular: 50 cmo_fib_accuracy: 6.25
sanitized_mbpp: 68.75 aime2024_accuracy: 6.25
ds1000: 16.96 wikibench-wiki-single_choice_cncircular_perf_4: 50
lcb_code_generation: 12.5 sanitized_mbpp_score: 68.75
lcb_code_execution: 43.75 ds1000_naive_average: 16.96
lcb_test_output: 18.75 lcb_code_generation_pass@1: 12.5
bbh-logical_deduction_seven_objects: 50 lcb_code_execution_pass@1: 43.75
bbh-multistep_arithmetic_two: 68.75 lcb_test_output_pass@1: 18.75
mmlu-other: 72.6 bbh-logical_deduction_seven_objects_score: 50
cmmlu-china-specific: 76.25 bbh-multistep_arithmetic_two_score: 68.75
mmlu_pro_math: 25 mmlu-other_naive_average: 72.6
ds1000_Pandas: 12.5 cmmlu-china-specific_naive_average: 76.25
ds1000_Numpy: 0 mmlu_pro_math_accuracy: 25
ds1000_Tensorflow: 12.5 ds1000_Pandas_accuracy: 12.5
ds1000_Scipy: 18.75 ds1000_Numpy_accuracy: 0
ds1000_Sklearn: 18.75 ds1000_Tensorflow_accuracy: 12.5
ds1000_Pytorch: 12.5 ds1000_Scipy_accuracy: 18.75
ds1000_Matplotlib: 43.75 ds1000_Sklearn_accuracy: 18.75
openai_mmmlu_lite_AR-XY: 37.5 ds1000_Pytorch_accuracy: 12.5
college: 12.5 ds1000_Matplotlib_accuracy: 43.75
college_knowledge: 87.5 openai_mmmlu_lite_AR-XY_accuracy: 37.5
Alignbench总分: 0.65 college_naive_average: 12.5
Alignbench专业能力: 7.83 college_knowledge_naive_average: 87.5
AlpacaEvaltotal: 0 subjective:
AlpacaEvalhelpful_base: 0 alignment_bench_v1_1_总分: 0.66
CompassArenacompassarena_language: 60 alpaca_eval_total: 20
CompassArenacompassarena_knowledge: 56 arenahard_score: 50
CompassArenacompassarena_reason_v2: 50 Followbench_naive_average: 1
CompassArenacompassarena_math_v2: 53.5 CompassArena_naive_average: 44.00
CompassArenacompassarena_creationv2_zh: 48.75 mtbench101_avg: 7.8
Fofofofo_test_prompts: 1 wildbench_average: -12.78
followbenchHSR_AVG: 1 simpleqa_accuracy_given_attempted: 0
followbenchSSR_AVG: 1 chinese_simpleqa_given_attempted_accuracy: 1
followbenchHSR_L1: 1 alignment_bench_v1_1_专业能力: 7.90
followbenchHSR_L2: 1 alignment_bench_v1_1_数学计算: 0
followbenchHSR_L3: 1 alignment_bench_v1_1_基本任务: 0
followbenchHSR_L4: 1 alignment_bench_v1_1_逻辑推理: 0
followbenchHSR_L5: 1 alignment_bench_v1_1_中文理解: 0
followbenchSSR_L1: 1 alignment_bench_v1_1_文本写作: 0
followbenchSSR_L2: 1 alignment_bench_v1_1_角色扮演: 0
followbenchSSR_L3: 1 alignment_bench_v1_1_综合问答: 0
followbenchSSR_L4: 1 alpaca_eval_helpful_base: 20
followbenchSSR_L5: 1 compassarena_language_naive_average: 35
MTBench101average: 8.1 compassarena_knowledge_naive_average: 55
Wildbenchscore: -3.3333333333333335 compassarena_reason_v2_naive_average: 45.00
compassarena_math_v2_naive_average: 55
compassarena_creationv2_zh_naive_average: 30
followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1
followbench_llmeval_en_HSR_L1: 1
followbench_llmeval_en_HSR_L2: 1
followbench_llmeval_en_HSR_L3: 1
followbench_llmeval_en_HSR_L4: 1
followbench_llmeval_en_HSR_L5: 1
followbench_llmeval_en_SSR_L1: 1
followbench_llmeval_en_SSR_L2: 1
followbench_llmeval_en_SSR_L3: 1
followbench_llmeval_en_SSR_L4: 1
followbench_llmeval_en_SSR_L5: 1
simpleqa_f1: 0
internlm2_5-7b-chat-turbomind_fullbench: internlm2_5-7b-chat-turbomind_fullbench:
race-high: 93.75 objective:
ARC-c: 87.5 race-high_accuracy: 93.75
BoolQ: 68.75 ARC-c_accuracy: 93.75
triviaqa_wiki_1shot: 50 BoolQ_accuracy: 68.75
nq_open_1shot: 25 triviaqa_wiki_1shot_score: 50
IFEval: 50 nq_open_1shot_score: 25
drop: 75 IFEval_Prompt-level-strict-accuracy: 56.25
hellaswag: 81.25 drop_accuracy: 81.25
TheoremQA: 6.25 GPQA_diamond_accuracy: 31.25
musr_average: 37.5 hellaswag_accuracy: 81.25
gsm8k: 68.75 TheoremQA_score: 6.25
math: 75 musr_average_naive_average: 39.58
GPQA_diamond: 25 korbench_single_naive_average: 37.50
cmo_fib: 6.25 gsm8k_accuracy: 68.75
aime2024: 6.25 math_accuracy: 68.75
wikibench-wiki-single_choice_cncircular: 25 cmo_fib_accuracy: 6.25
sanitized_mbpp: 68.75 aime2024_accuracy: 6.25
ds1000: 13.39 wikibench-wiki-single_choice_cncircular_perf_4: 50.00
lcb_code_generation: 12.5 sanitized_mbpp_score: 68.75
lcb_code_execution: 43.75 ds1000_naive_average: 16.96
lcb_test_output: 12.5 lcb_code_generation_pass@1: 12.5
bbh-logical_deduction_seven_objects: 56.25 lcb_code_execution_pass@1: 43.75
bbh-multistep_arithmetic_two: 68.75 lcb_test_output_pass@1: 25.00
mmlu-other: 74.04 bbh-logical_deduction_seven_objects_score: 50.00
cmmlu-china-specific: 76.25 bbh-multistep_arithmetic_two_score: 68.75
mmlu_pro_math: 25 mmlu-other_naive_average: 69.71
ds1000_Pandas: 0 cmmlu-china-specific_naive_average: 75.83
ds1000_Numpy: 0 mmlu_pro_math_accuracy: 31.25
ds1000_Tensorflow: 12.5 ds1000_Pandas_accuracy: 0
ds1000_Scipy: 18.75 ds1000_Numpy_accuracy: 0
ds1000_Sklearn: 18.75 ds1000_Tensorflow_accuracy: 12.5
ds1000_Pytorch: 6.25 ds1000_Scipy_accuracy: 18.75
ds1000_Matplotlib: 37.5 ds1000_Sklearn_accuracy: 18.75
openai_mmmlu_lite_AR-XY: 37.5 ds1000_Pytorch_accuracy: 18.75
college: 0 ds1000_Matplotlib_accuracy: 50.00
college_knowledge: 87.5 openai_mmmlu_lite_AR-XY_accuracy: 37.5
Alignbench总分: 0.64 college_naive_average: 12.50
Alignbench专业能力: 7.6 college_knowledge_naive_average: 87.5
AlpacaEvaltotal: 10 subjective:
AlpacaEvalhelpful_base: 10 alignment_bench_v1_1_总分: 0.70
CompassArenacompassarena_language: 59 alpaca_eval_total: 0
CompassArenacompassarena_knowledge: 57 arenahard_score: 50
CompassArenacompassarena_reason_v2: 49.5 Followbench_naive_average: 1
CompassArenacompassarena_math_v2: 51 CompassArena_naive_average: 38
CompassArenacompassarena_creationv2_zh: 43.75 mtbench101_avg: 7.80
Fofofofo_test_prompts: 1 wildbench_average: -4.86
followbenchHSR_AVG: 1 simpleqa_accuracy_given_attempted: 0
followbenchSSR_AVG: 1 chinese_simpleqa_given_attempted_accuracy: 1
followbenchHSR_L1: 1 alignment_bench_v1_1_专业能力: 8.4
followbenchHSR_L2: 1 alignment_bench_v1_1_数学计算: 0
followbenchHSR_L3: 1 alignment_bench_v1_1_基本任务: 0
followbenchHSR_L4: 1 alignment_bench_v1_1_逻辑推理: 0
followbenchHSR_L5: 1 alignment_bench_v1_1_中文理解: 0
followbenchSSR_L1: 1 alignment_bench_v1_1_文本写作: 0
followbenchSSR_L2: 1 alignment_bench_v1_1_角色扮演: 0
followbenchSSR_L3: 1 alignment_bench_v1_1_综合问答: 0
followbenchSSR_L4: 1 alpaca_eval_helpful_base: 0
followbenchSSR_L5: 1 compassarena_language_naive_average: 35
MTBench101average: 8.1 compassarena_knowledge_naive_average: 50
Wildbenchscore: -8.333333333333334 compassarena_reason_v2_naive_average: 30
compassarena_math_v2_naive_average: 50
compassarena_creationv2_zh_naive_average: 25
followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1
followbench_llmeval_en_HSR_L1: 1
followbench_llmeval_en_HSR_L2: 1
followbench_llmeval_en_HSR_L3: 1
followbench_llmeval_en_HSR_L4: 1
followbench_llmeval_en_HSR_L5: 1
followbench_llmeval_en_SSR_L1: 1
followbench_llmeval_en_SSR_L2: 1
followbench_llmeval_en_SSR_L3: 1
followbench_llmeval_en_SSR_L4: 1
followbench_llmeval_en_SSR_L5: 1
simpleqa_f1: 0
internlm2_5-7b-hf_fullbench: internlm2_5-7b-hf_fullbench:
race-high: 100 objective:
ARC-c: 68.75 race-high_accuracy: 100
BoolQ: 87.5 ARC-c_accuracy: 68.75
GPQA_diamond: 62.5 BoolQ_accuracy: 87.5
drop: 62.5 triviaqa_wiki_1shot_score: 43.75
math: 12.5 nq_open_1shot_score: 43.75
wikibench-wiki-single_choice_cncircular: 25 drop_accuracy: 62.5
sanitized_mbpp: 56.25 GPQA_diamond_accuracy: 62.5
gsm8k: 37.5 hellaswag_accuracy: 93.75
triviaqa_wiki_1shot: 43.75 TheoremQA_score: 25
nq_open_1shot: 43.75 winogrande_accuracy: 75
winogrande: 75 gsm8k_accuracy: 37.5
hellaswag: 93.75 GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
TheoremQA: 25 GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
dingo_en_192: 37.5 math_accuracy: 12.5
dingo_zh_170: 100 wikibench-wiki-single_choice_cncircular_perf_4: 25
college: 12.5 sanitized_mbpp_score: 56.25
college_knowledge: 87.5 dingo_en_192_score: 37.5
bbh-logical_deduction_seven_objects: 43.75 dingo_zh_170_score: 100
bbh-multistep_arithmetic_two: 56.25 mmlu-other_accuracy: 76.92
mmlu-other: 76.92 cmmlu-china-specific_accuracy: 84.17
cmmlu-china-specific: 84.17 mmlu_pro_math_accuracy: 18.75
mmlu_pro_math: 18.75 bbh-logical_deduction_seven_objects_score: 43.75
bbh-multistep_arithmetic_two_score: 56.25
college_naive_average: 12.5
college_knowledge_naive_average: 87.5
internlm2_5-7b-turbomind_fullbench: internlm2_5-7b-turbomind_fullbench:
race-high: 100 objective:
ARC-c: 68.75 race-high_accuracy: 100
BoolQ: 87.5 ARC-c_accuracy: 68.75
GPQA_diamond: 62.5 BoolQ_accuracy: 87.5
drop: 62.5 triviaqa_wiki_1shot_score: 43.75
math: 18.75 nq_open_1shot_score: 43.75
wikibench-wiki-single_choice_cncircular: 25 drop_accuracy: 62.5
sanitized_mbpp: 56.25 GPQA_diamond_accuracy: 62.5
gsm8k: 68.75 hellaswag_accuracy: 93.75
triviaqa_wiki_1shot: 43.75 TheoremQA_score: 25.00
nq_open_1shot: 43.75 winogrande_accuracy: 87.5
winogrande: 87.5 gsm8k_accuracy: 62.50
hellaswag: 93.75 GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25
TheoremQA: 31.25 GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
dingo_en_192: 43.75 math_accuracy: 18.75
dingo_zh_170: 100 wikibench-wiki-single_choice_cncircular_perf_4: 25
college: 12.5 sanitized_mbpp_score: 62.50
college_knowledge: 87.5 dingo_en_192_score: 31.25
bbh-logical_deduction_seven_objects: 50 dingo_zh_170_score: 93.75
bbh-multistep_arithmetic_two: 56.25 mmlu-other_accuracy: 76.92
mmlu-other: 76.92 cmmlu-china-specific_accuracy: 84.17
cmmlu-china-specific: 84.17 mmlu_pro_math_accuracy: 18.75
mmlu_pro_math: 18.75 bbh-logical_deduction_seven_objects_score: 50
bbh-multistep_arithmetic_two_score: 56.25
college_naive_average: 12.5
college_knowledge_naive_average: 87.5
internlm2_5-7b-turbomind:
objective:
race-high_accuracy: 89.28
ARC-c_accuracy: 52.2
BoolQ_accuracy: 89.72
triviaqa_wiki_1shot_score: 65.88
nq_open_1shot_score: 34.82
drop_accuracy: 68.1
bbh_naive_average: 72.15
GPQA_diamond_accuracy: 32.83
hellaswag_accuracy: 88.36
TheoremQA_score: 25
winogrande_accuracy: 81.29
gsm8k_accuracy: 74.68
GaokaoBench_weighted_average: 58.19
math_accuracy: 33.98
Mathbench_naive_average: 48.38
wikibench-wiki-single_choice_cncircular_perf_4: 29.1
cmmlu_naive_average: 78.94
mmlu_naive_average: 71.44
mmlu_pro_naive_average: 38.18
openai_humaneval_humaneval_pass@1: 59.76
openai_humaneval_v2_humaneval_pass@1: 51.22
sanitized_mbpp_score: 55.25
dingo_en_192_score: 60.94
dingo_zh_170_score: 67.65
mmlu-stem_naive_average: 63.72
mmlu-social-science_naive_average: 80.15
mmlu-humanities_naive_average: 74.27
mmlu-other_naive_average: 71.85
cmmlu-stem_naive_average: 67.07
cmmlu-social-science_naive_average: 81.49
cmmlu-humanities_naive_average: 85.84
cmmlu-other_naive_average: 82.69
cmmlu-china-specific_naive_average: 79.88
mmlu_pro_biology_accuracy: 58.58
mmlu_pro_business_accuracy: 28.01
mmlu_pro_chemistry_accuracy: 22.79
mmlu_pro_computer_science_accuracy: 39.02
mmlu_pro_economics_accuracy: 53.08
mmlu_pro_engineering_accuracy: 25.7
mmlu_pro_health_accuracy: 46.94
mmlu_pro_history_accuracy: 43.04
mmlu_pro_law_accuracy: 29.7
mmlu_pro_math_accuracy: 24.2
mmlu_pro_philosophy_accuracy: 42.48
mmlu_pro_physics_accuracy: 26.02
mmlu_pro_psychology_accuracy: 52.76
mmlu_pro_other_accuracy: 42.21
college_naive_average: 10.67
high_naive_average: 6.67
middle_naive_average: 26.67
primary_naive_average: 60
arithmetic_naive_average: 55
mathbench-a (average)_naive_average: 31.8
college_knowledge_naive_average: 62.34
high_knowledge_naive_average: 59.83
middle_knowledge_naive_average: 71.15
primary_knowledge_naive_average: 66.55
mathbench-t (average)_naive_average: 64.97
long_context:
Single-Needle-Retrieval(S-RT)-32000_naive_average: 100
Single-Needle-Retrieval-EN-32000_naive_average: 100
Single-Needle-Retrieval-ZH-32000_naive_average: 100
Single-Needle-Retrieval(S-RT)-100000_naive_average: 100
Single-Needle-Retrieval-EN-100000_naive_average: 100
Single-Needle-Retrieval-ZH-100000_naive_average: 100
Single-Needle-Retrieval(S-RT)-200000_naive_average: 100
Single-Needle-Retrieval-EN-200000_naive_average: 100
Single-Needle-Retrieval-ZH-200000_naive_average: 100
longbench_naive_average: 46.19
longbench_zh_naive_average: 49.3
longbench_en_naive_average: 43.97
longbench_single-document-qa_naive_average: 42.84
longbench_multi-document-qa_naive_average: 37.29
longbench_summarization_naive_average: 23.21
longbench_few-shot-learning_naive_average: 61.67
longbench_synthetic-tasks_naive_average: 60.05
longbench_code-completion_naive_average: 52.09
internlm2_5-7b-chat-turbomind:
objective:
race-high_accuracy: 86.16
ARC-c_accuracy: 90.17
BoolQ_accuracy: 87.89
triviaqa_wiki_1shot_score: 64.91
nq_open_1shot_score: 22.69
mmmlu_lite_naive_average: 44.96
IFEval_Prompt-level-strict-accuracy: 58.04
drop_accuracy: 77.68
bbh_naive_average: 73.14
GPQA_diamond_accuracy: 25.76
hellaswag_accuracy: 94.79
TheoremQA_score: 21.5
musr_average_naive_average: 51.03
korbench_single_naive_average: 31.92
ARC_Prize_Public_Evaluation_accuracy: 0.01
gsm8k_accuracy: 86.73
GaokaoBench_weighted_average: 77.89
math_accuracy: 61.5
cmo_fib_accuracy: 12.5
aime2024_accuracy: 3.33
Mathbench_naive_average: 65.17
wikibench-wiki-single_choice_cncircular_perf_4: 31.55
cmmlu_naive_average: 74.14
mmlu_naive_average: 70.52
mmlu_pro_naive_average: 44.98
openai_humaneval_humaneval_pass@1: 70.73
sanitized_mbpp_score: 63.81
humanevalx_naive_average: 38.17
ds1000_naive_average: 14.15
lcb_code_generation_pass@1: 17.75
lcb_code_execution_pass@1: 32.57
lcb_test_output_pass@1: 24.89
bigcodebench_hard_instruct_pass@1: 0.08
bigcodebench_hard_complete_pass@1: 0.06
teval_naive_average: 80.03
qa_dingo_cn_score: 99.01
mmlu-stem_naive_average: 68.2
mmlu-social-science_naive_average: 76.11
mmlu-humanities_naive_average: 68.71
mmlu-other_naive_average: 70.56
cmmlu-stem_naive_average: 66.27
cmmlu-social-science_naive_average: 75.7
cmmlu-humanities_naive_average: 77.7
cmmlu-other_naive_average: 77.71
cmmlu-china-specific_naive_average: 72.94
mmlu_pro_biology_accuracy: 66.25
mmlu_pro_business_accuracy: 48.42
mmlu_pro_chemistry_accuracy: 35.25
mmlu_pro_computer_science_accuracy: 47.56
mmlu_pro_economics_accuracy: 55.92
mmlu_pro_engineering_accuracy: 30.44
mmlu_pro_health_accuracy: 45.97
mmlu_pro_history_accuracy: 41.21
mmlu_pro_law_accuracy: 25.79
mmlu_pro_math_accuracy: 54.03
mmlu_pro_philosophy_accuracy: 36.47
mmlu_pro_physics_accuracy: 37.41
mmlu_pro_psychology_accuracy: 58.77
mmlu_pro_other_accuracy: 46.21
humanevalx-python_pass@1: 53.66
humanevalx-cpp_pass@1: 24.39
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 57.93
humanevalx-js_pass@1: 54.88
ds1000_Pandas_accuracy: 12.03
ds1000_Numpy_accuracy: 4.09
ds1000_Tensorflow_accuracy: 11.11
ds1000_Scipy_accuracy: 8.49
ds1000_Sklearn_accuracy: 6.96
ds1000_Pytorch_accuracy: 7.35
ds1000_Matplotlib_accuracy: 49.03
openai_mmmlu_lite_AR-XY_accuracy: 17.89
openai_mmmlu_lite_BN-BD_accuracy: 27.58
openai_mmmlu_lite_DE-DE_accuracy: 51.16
openai_mmmlu_lite_ES-LA_accuracy: 56.84
openai_mmmlu_lite_FR-FR_accuracy: 57.96
openai_mmmlu_lite_HI-IN_accuracy: 33.68
openai_mmmlu_lite_ID-ID_accuracy: 51.02
openai_mmmlu_lite_IT-IT_accuracy: 50.46
openai_mmmlu_lite_JA-JP_accuracy: 50.53
openai_mmmlu_lite_KO-KR_accuracy: 45.05
openai_mmmlu_lite_PT-BR_accuracy: 57.68
openai_mmmlu_lite_SW-KE_accuracy: 32.77
openai_mmmlu_lite_YO-NG_accuracy: 31.79
openai_mmmlu_lite_ZH-CN_accuracy: 65.05
college_naive_average: 20.33
high_naive_average: 47.67
middle_naive_average: 62
primary_naive_average: 72
arithmetic_naive_average: 62.33
mathbench-a (average)_naive_average: 52.87
college_knowledge_naive_average: 70.57
high_knowledge_naive_average: 70.13
middle_knowledge_naive_average: 81.17
primary_knowledge_naive_average: 88.01
mathbench-t (average)_naive_average: 77.47
subjective:
alignment_bench_v1_1_总分: 5.68
alpaca_eval_total: 25.96
arenahard_score: 17.15
Followbench_naive_average: 0.81
CompassArena_naive_average: 34.61
FoFo_naive_average: 0.38
mtbench101_avg: 8.01
wildbench_average: -15.69
simpleqa_accuracy_given_attempted: 0.04
chinese_simpleqa_given_attempted_accuracy: 0.34
alignment_bench_v1_1_专业能力: 6.05
alignment_bench_v1_1_数学计算: 5.87
alignment_bench_v1_1_基本任务: 6.01
alignment_bench_v1_1_逻辑推理: 4.48
alignment_bench_v1_1_中文理解: 6.17
alignment_bench_v1_1_文本写作: 6.06
alignment_bench_v1_1_角色扮演: 6.3
alignment_bench_v1_1_综合问答: 6.45
alpaca_eval_helpful_base: 17.83
alpaca_eval_koala: 28.21
alpaca_eval_oasst: 23.4
alpaca_eval_selfinstruct: 30.95
alpaca_eval_vicuna: 25
compassarena_language_naive_average: 52.5
compassarena_knowledge_naive_average: 36
compassarena_reason_v2_naive_average: 35
compassarena_math_v2_naive_average: 19.91
compassarena_creationv2_zh_naive_average: 29.64
fofo_test_prompts_overall: 0.35
fofo_test_prompts_cn_overall: 0.41
followbench_llmeval_en_HSR_AVG: 0.73
followbench_llmeval_en_SSR_AVG: 0.88
followbench_llmeval_en_HSR_L1: 0.94
followbench_llmeval_en_HSR_L2: 0.77
followbench_llmeval_en_HSR_L3: 0.73
followbench_llmeval_en_HSR_L4: 0.68
followbench_llmeval_en_HSR_L5: 0.54
followbench_llmeval_en_SSR_L1: 0.94
followbench_llmeval_en_SSR_L2: 0.88
followbench_llmeval_en_SSR_L3: 0.87
followbench_llmeval_en_SSR_L4: 0.87
followbench_llmeval_en_SSR_L5: 0.85
simpleqa_f1: 0.04
internlm2_5-7b-chat-1m-turbomind:
long_context:
ruler_8k_naive_average: 88.53
ruler_32k_naive_average: 83.84
ruler_128k_naive_average: 70.94
NeedleBench-Overall-Score-8K_weighted_average: 91.89
NeedleBench-Overall-Score-32K_weighted_average: 91.42
NeedleBench-Overall-Score-128K_weighted_average: 88.57
longbench_naive_average: 46.44
longbench_zh_naive_average: 45.19
longbench_en_naive_average: 45.71
babilong_0k_naive_average: 79.3
babilong_4k_naive_average: 67
babilong_16k_naive_average: 52.7
babilong_32k_naive_average: 48.9
babilong_128k_naive_average: 40.8
babilong_256k_naive_average: 23.5
longbench_single-document-qa_naive_average: 43.56
longbench_multi-document-qa_naive_average: 46.24
longbench_summarization_naive_average: 24.32
longbench_few-shot-learning_naive_average: 51.67
longbench_synthetic-tasks_naive_average: 66.83
longbench_code-completion_naive_average: 45.99
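
Besides the key renaming, the fullbench baselines gain a level of nesting: each model's scores are now grouped under capability buckets (objective, subjective, long_context). Any consumer that previously read baseline[model][dataset] needs the bucket as well; a minimal sketch, with the helper name and default purely illustrative:

def get_baseline(baseline_scores, model, dataset, bucket='objective'):
    # `bucket` is one of 'objective', 'subjective' or 'long_context'.
    # This helper is an assumption, not part of the PR.
    return baseline_scores.get(model, {}).get(bucket, {}).get(dataset)
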

View File

@ -1,459 +1,468 @@
baichuan2-7b-chat-hf: chat:
gsm8k: 18.75 glm-4-9b-chat-hf:
race-high: 78.12 gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
glm-4-9b-chat-hf: glm-4-9b-chat-turbomind:
gsm8k: 68.75 gsm8k_accuracy: 71.88
race-high: 90.62 race-high_accuracy: 90.62
glm-4-9b-chat-vllm:
glm-4-9b-chat-turbomind: gsm8k_accuracy: 65.62
gsm8k: 75.00 race-high_accuracy: 90.62
race-high: 90.62 deepseek-7b-chat-hf:
gsm8k_accuracy: 46.88
glm-4-9b-chat-vllm: race-high_accuracy: 81.25
gsm8k: 65.62 deepseek-moe-16b-chat-hf:
race-high: 90.62 gsm8k_accuracy: 50
race-high_accuracy: 68.75
deepseek-7b-chat-hf: deepseek-7b-chat-vllm:
gsm8k: 46.88 gsm8k_accuracy: 43.75
race-high: 81.25 race-high_accuracy: 75
gemma2-2b-it-hf:
deepseek-moe-16b-chat-hf: gsm8k_accuracy: 50
gsm8k: 50 race-high_accuracy: 71.88
race-high: 68.75 gemma2-9b-it-hf:
gsm8k_accuracy: 71.88
deepseek-7b-chat-vllm: race-high_accuracy: 84.38
gsm8k: 43.75 gemma-2b-it-hf:
race-high: 75 gsm8k_accuracy: 3.12
race-high_accuracy: 40.62
gemma2-2b-it-hf: gemma-7b-it-hf:
gsm8k: 50 gsm8k_accuracy: 40.62
race-high: 71.88 race-high_accuracy: 68.75
gemma-2-9b-it-turbomind:
gemma2-9b-it-hf: gsm8k_accuracy: 71.88
gsm8k: 71.88 race-high_accuracy: 84.38
race-high: 84.38 gemma-2-27b-it-turbomind:
gsm8k_accuracy: 78.12
gemma-2b-it-hf: race-high_accuracy: 93.75
gsm8k: 3.12 gemma-7b-it-vllm:
race-high: 40.62 gsm8k_accuracy: 34.38
race-high_accuracy: 68.75
gemma-7b-it-hf: internlm2_5-7b-chat-hf:
gsm8k: 40.62 gsm8k_accuracy: 84.38
race-high: 68.75 race-high_accuracy: 90.62
internlm2_5-7b-chat-turbomind:
gemma-2-9b-it-turbomind: gsm8k_accuracy: 87.50
gsm8k: 65.62 race-high_accuracy: 90.62
race-high: 84.38 internlm2-chat-1.8b-turbomind:
gsm8k_accuracy: 28.12
gemma-7b-it-vllm: race-high_accuracy: 84.38
gsm8k: 34.38 internlm2-chat-1.8b-sft-turbomind:
race-high: 68.75 gsm8k_accuracy: 21.88
race-high_accuracy: 84.38
internlm2_5-7b-chat-hf: internlm2-chat-7b-lmdeploy:
gsm8k: 84.38 gsm8k_accuracy: 53.12
race-high: 90.62 race-high_accuracy: 84.38
internlm2-chat-7b-sft-turbomind:
internlm2_5-7b-chat-turbomind: gsm8k_accuracy: 53.12
gsm8k: 84.38 race-high_accuracy: 90.62
race-high: 90.62 internlm2-chat-7b-vllm:
gsm8k_accuracy: 56.25
internlm2-chat-1.8b-turbomind: race-high_accuracy: 84.38
gsm8k: 25 llama-3_1-8b-instruct-hf:
race-high: 84.38 gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
internlm2-chat-1.8b-sft-turbomind: llama-3_2-3b-instruct-hf:
gsm8k: 21.88 gsm8k_accuracy: 68.75
race-high: 84.38 race-high_accuracy: 81.25
llama-3-8b-instruct-hf:
internlm2-chat-7b-lmdeploy: gsm8k_accuracy: 68.75
gsm8k: 53.12 race-high_accuracy: 87.5
race-high: 84.38 llama-2-7b-chat-turbomind:
gsm8k_accuracy: 18.75
internlm2-chat-7b-sft-turbomind: race-high_accuracy: 46.88
gsm8k: 50 llama-3_1-8b-instruct-turbomind:
race-high: 90.62 gsm8k_accuracy: 78.12
race-high_accuracy: 90.62
internlm2-chat-7b-vllm: llama-3_2-3b-instruct-turbomind:
gsm8k: 43.75 gsm8k_accuracy: 71.88
race-high: 87.5 race-high_accuracy: 81.25
llama-3-8b-instruct-turbomind:
llama-3_1-8b-instruct-hf: gsm8k_accuracy: 71.88
gsm8k: 84.38 race-high_accuracy: 87.5
race-high: 90.62 mistral-7b-instruct-v0.2-hf:
gsm8k_accuracy: 40.62
llama-3_2-3b-instruct-hf: race-high_accuracy: 75
gsm8k: 65.62 mistral-7b-instruct-v0.3-hf:
race-high: 81.25 gsm8k_accuracy: 40.62
race-high_accuracy: 75
llama-3-8b-instruct-hf: mistral-nemo-instruct-2407-hf:
gsm8k: 68.75 gsm8k_accuracy: 75
race-high: 87.5 race-high_accuracy: 81.25
mistral-nemo-instruct-2407-turbomind:
llama-3_1-8b-instruct-turbomind: gsm8k_accuracy: 65.62
gsm8k: 78.12 race-high_accuracy: 87.50
race-high: 90.62 mistral-7b-instruct-v0.1-vllm:
gsm8k_accuracy: 34.38
llama-3_2-3b-instruct-turbomind: race-high_accuracy: 68.75
gsm8k: 62.50 mistral-7b-instruct-v0.2-vllm:
race-high: 81.25 gsm8k_accuracy: 43.75
race-high_accuracy: 75
llama-3-8b-instruct-turbomind: MiniCPM3-4B-hf:
gsm8k: 68.75 gsm8k_accuracy: 68.75
race-high: 87.5 race-high_accuracy: 84.38
phi-3-mini-4k-instruct-hf:
mistral-7b-instruct-v0.2-hf: gsm8k_accuracy: 56.25
gsm8k: 40.62 race-high_accuracy: 84.38
race-high: 75 phi-3-small-8k-instruct-hf:
gsm8k_accuracy: 0
mistral-7b-instruct-v0.3-hf: race-high_accuracy: 0
gsm8k: 40.62 qwen2.5-0.5b-instruct-hf:
race-high: 75 gsm8k_accuracy: 34.38
race-high_accuracy: 46.88
mistral-nemo-instruct-2407-hf: qwen2.5-3b-instruct-hf:
gsm8k: 75 gsm8k_accuracy: 53.12
race-high: 81.25 race-high_accuracy: 90.62
qwen2.5-0.5b-instruct-turbomind:
mistral-nemo-instruct-2407-turbomind: gsm8k_accuracy: 28.12
gsm8k: 68.75 race-high_accuracy: 50
race-high: 87.50 qwen2.5-3b-instruct-turbomind:
gsm8k_accuracy: 59.38
mistral-7b-instruct-v0.1-vllm: race-high_accuracy: 90.62
gsm8k: 34.38 qwen1.5-0.5b-chat-hf:
race-high: 68.75 gsm8k_accuracy: 0
race-high_accuracy: 53.12
mistral-7b-instruct-v0.2-vllm: qwen2-1.5b-instruct-hf:
gsm8k: 43.75 gsm8k_accuracy: 62.5
race-high: 75 race-high_accuracy: 84.38
qwen2-7b-instruct-hf:
MiniCPM3-4B-hf: gsm8k_accuracy: 68.75
gsm8k: 68.75 race-high_accuracy: 90.62
race-high: 84.38 qwen2-1.5b-instruct-turbomind:
gsm8k_accuracy: 53.12
minicpm-2b-dpo-fp32-hf: race-high_accuracy: 84.38
gsm8k: 56.25 qwen2-7b-instruct-turbomind:
race-high: 53.12 gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
minicpm-2b-sft-bf16-hf: qwen1.5-0.5b-chat-vllm:
gsm8k: 46.88 gsm8k_accuracy: 3.12
race-high: 65.62 race-high_accuracy: 53.12
yi-1.5-6b-chat-hf:
minicpm-2b-sft-fp32-hf: gsm8k_accuracy: 65.62
gsm8k: 46.88 race-high_accuracy: 84.38
race-high: 65.62 yi-1.5-9b-chat-hf:
gsm8k_accuracy: 75
phi-3-mini-4k-instruct-hf: race-high_accuracy: 93.75
gsm8k: 56.25 yi-1.5-6b-chat-turbomind:
race-high: 84.38 gsm8k_accuracy: 62.5
race-high_accuracy: 84.38
qwen1.5-0.5b-chat-hf: yi-1.5-9b-chat-turbomind:
gsm8k: 0 gsm8k_accuracy: 71.88
race-high: 53.12 race-high_accuracy: 93.75
deepseek-v2-lite-chat-hf:
qwen2-1.5b-instruct-hf: gsm8k_accuracy: 46.88
gsm8k: 62.5 race-high_accuracy: 71.88
race-high: 84.38 gemma2-27b-it-hf:
gsm8k_accuracy: 75
qwen2-7b-instruct-hf: race-high_accuracy: 93.75
gsm8k: 68.75 internlm2_5-20b-chat-hf:
race-high: 90.62 gsm8k_accuracy: 84.38
race-high_accuracy: 87.5
qwen2-1.5b-instruct-turbomind: internlm2_5-20b-chat-turbomind:
gsm8k: 62.50 gsm8k_accuracy: 87.50
race-high: 84.38 race-high_accuracy: 87.5
mistral-small-instruct-2409-hf:
qwen2-7b-instruct-turbomind: gsm8k_accuracy: 81.25
gsm8k: 81.25 race-high_accuracy: 87.50
race-high: 87.5 mistral-small-instruct-2409-turbomind:
gsm8k_accuracy: 81.25
qwen1.5-0.5b-chat-vllm: race-high_accuracy: 87.50
gsm8k: 3.12 qwen2.5-14b-instruct-hf:
race-high: 53.12 gsm8k_accuracy: 71.88
race-high_accuracy: 96.88
yi-1.5-6b-chat-hf: qwen2.5-14b-instruct-turbomind:
gsm8k: 65.62 gsm8k_accuracy: 68.75
race-high: 84.38 race-high_accuracy: 93.75
yi-1.5-34b-chat-turbomind:
yi-1.5-9b-chat-hf: gsm8k_accuracy: 78.12
gsm8k: 75 race-high_accuracy: 93.75
race-high: 93.75 deepseek-67b-chat-hf:
gsm8k_accuracy: 71.88
deepseek-v2-lite-chat-hf: race-high_accuracy: 78.12
gsm8k: 43.75 llama-3_3-70b-instruct-turbomind:
race-high: 71.88 gsm8k_accuracy: 93.75
race-high_accuracy: 87.5
internlm2_5-20b-chat-hf: mixtral-8x7b-instruct-v0.1-hf:
gsm8k: 84.38 gsm8k_accuracy: 56.25
race-high: 87.5 race-high_accuracy: 81.25
mixtral-large-instruct-2411-turbomind:
internlm2_5-20b-chat-turbomind: gsm8k_accuracy: 90.62
gsm8k: 84.38 race-high_accuracy: 93.75
race-high: 87.5 nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
gsm8k_accuracy: 87.5
mistral-small-instruct-2409-hf: race-high_accuracy: 46.88
gsm8k: 81.25 qwen2.5-72b-instruct-turbomind:
race-high: 87.50 gsm8k_accuracy: 75
race-high_accuracy: 93.75
mistral-small-instruct-2409-turbomind: deepseek-v2_5-1210-turbomind:
gsm8k: 78.12 gsm8k_accuracy: 90.62
race-high: 87.50 race-high_accuracy: 84.38
mixtral-8x22b-instruct-v0.1-hf:
qwen2.5-14b-instruct-hf: gsm8k_accuracy: 81.25
gsm8k: 71.88 race-high_accuracy: 81.25
race-high: 96.88 base:
glm-4-9b-hf:
qwen2.5-14b-instruct-turbomind: gsm8k_accuracy: 68.75
gsm8k: 71.88 GPQA_diamond_accuracy: 31.25
race-high: 93.75 race-high_accuracy: 93.75
winogrande_accuracy: 84.38
glm-4-9b-hf: glm-4-9b-turbomind:
gsm8k: 68.75 gsm8k_accuracy: 62.5
GPQA_diamond: 31.25 GPQA_diamond_accuracy: 28.12
race-high: 93.75 race-high_accuracy: 93.75
winogrande: 84.38 winogrande_accuracy: 84.38
deepseek-7b-base-hf:
deepseek-moe-16b-base-hf: gsm8k_accuracy: 25
gsm8k: 21.88 GPQA_diamond_accuracy: 0
GPQA_diamond: 0 race-high_accuracy: 46.88
race-high: 21.88 winogrande_accuracy: 71.88
winogrande: 65.62 deepseek-moe-16b-base-hf:
gsm8k_accuracy: 21.88
deepseek-7b-base-turbomind: GPQA_diamond_accuracy: 0
gsm8k: 21.88 race-high_accuracy: 21.88
GPQA_diamond: 0 winogrande_accuracy: 65.62
race-high: 46.88 deepseek-7b-base-turbomind:
winogrande: 84.38 gsm8k_accuracy: 21.88
GPQA_diamond_accuracy: 0
deepseek-moe-16b-base-vllm: race-high_accuracy: 46.88
gsm8k: 21.88 winogrande_accuracy: 84.38
GPQA_diamond: 0 deepseek-moe-16b-base-vllm:
race-high: 25 gsm8k_accuracy: 21.88
winogrande: 68.75 GPQA_diamond_accuracy: 0
race-high_accuracy: 25
gemma2-2b-hf: winogrande_accuracy: 68.75
gsm8k: 31.25 gemma2-2b-hf:
GPQA_diamond: 3.12 gsm8k_accuracy: 28.12
race-high: 56.25 GPQA_diamond_accuracy: 3.12
winogrande: 71.88 race-high_accuracy: 56.25
winogrande_accuracy: 71.88
gemma2-9b-hf: gemma2-9b-hf:
gsm8k: 68.75 gsm8k_accuracy: 68.75
GPQA_diamond: 0 GPQA_diamond_accuracy: 0
race-high: 81.25 race-high_accuracy: 81.25
winogrande: 84.38 winogrande_accuracy: 84.38
gemma-2b-hf:
gemma-2b-hf: gsm8k_accuracy: 18.75
gsm8k: 18.75 GPQA_diamond_accuracy: 3.12
GPQA_diamond: 3.12 race-high_accuracy: 25
race-high: 25 winogrande_accuracy: 53.12
winogrande: 53.12 gemma-7b-hf:
gsm8k_accuracy: 56.25
gemma-7b-hf: GPQA_diamond_accuracy: 6.25
gsm8k: 56.25 race-high_accuracy: 65.62
GPQA_diamond: 6.25 winogrande_accuracy: 78.12
race-high: 65.62 gemma-2b-vllm:
winogrande: 78.12 gsm8k_accuracy: 15.62
GPQA_diamond_accuracy: 3.12
gemma-2b-vllm: race-high_accuracy:
gsm8k: 15.62 winogrande_accuracy:
GPQA_diamond: 6.25 gemma-7b-vllm:
race-high: gsm8k_accuracy: 53.12
winogrande: GPQA_diamond_accuracy: 9.38
race-high_accuracy:
gemma-7b-vllm: winogrande_accuracy:
gsm8k: 53.12 internlm2_5-7b-hf:
GPQA_diamond: 6.25 gsm8k_accuracy: 37.5
race-high: GPQA_diamond_accuracy: 25
winogrande: race-high_accuracy: 93.75
winogrande_accuracy: 71.88
internlm2_5-7b-hf: internlm2-7b-hf:
gsm8k: 37.5 gsm8k_accuracy: 53.12
GPQA_diamond: 25 GPQA_diamond_accuracy: 18.75
race-high: 93.75 race-high_accuracy: 62.5
winogrande: 71.88 winogrande_accuracy: 78.12
internlm2-base-7b-hf:
internlm2-7b-hf: gsm8k_accuracy: 3.12
gsm8k: 53.12 GPQA_diamond_accuracy: 21.88
GPQA_diamond: 18.75 race-high_accuracy: 75
race-high: 62.5 winogrande_accuracy: 65.62
winogrande: 78.12 internlm2-1.8b-turbomind:
gsm8k_accuracy: 12.5
internlm2-base-7b-hf: GPQA_diamond_accuracy: 9.38
gsm8k: 3.12 race-high_accuracy: 71.88
GPQA_diamond: 21.88 winogrande_accuracy: 78.12
race-high: 75 internlm2_5-7b-turbomind:
winogrande: 65.62 gsm8k_accuracy: 62.50
GPQA_diamond_accuracy: 34.38
internlm2-1.8b-turbomind: race-high_accuracy: 93.75
gsm8k: 12.5 winogrande_accuracy: 87.50
GPQA_diamond: 12.5 internlm2-7b-turbomind:
race-high: 71.88 gsm8k_accuracy: 53.12
winogrande: 75 GPQA_diamond_accuracy: 21.88
race-high_accuracy: 71.88
internlm2_5-7b-turbomind: winogrande_accuracy: 84.38
gsm8k: 68.75 internlm2-base-7b-turbomind:
GPQA_diamond: 31.25 gsm8k_accuracy: 37.50
race-high: 93.75 GPQA_diamond_accuracy: 28.12
winogrande: 84.38 race-high_accuracy: 81.25
winogrande_accuracy: 75
internlm2-7b-turbomind: llama-2-7b-hf:
gsm8k: 56.25 gsm8k_accuracy: 21.88
GPQA_diamond: 21.88 GPQA_diamond_accuracy: 21.88
race-high: 75 race-high_accuracy: 40.62
winogrande: 81.25 winogrande_accuracy: 71.88
llama-3_1-8b-hf:
internlm2-base-7b-turbomind: gsm8k_accuracy: 78.12
gsm8k: 40.62 GPQA_diamond_accuracy: 25
GPQA_diamond: 28.12 race-high_accuracy: 90.62
race-high: 84.38 winogrande_accuracy: 62.5
winogrande: 71.88 llama-3-8b-hf:
gsm8k_accuracy: 46.88
llama-2-7b-hf: GPQA_diamond_accuracy: 6.25
gsm8k: 21.88 race-high_accuracy: 65.62
GPQA_diamond: 21.88 winogrande_accuracy: 65.62
race-high: 40.62 llama-3.1-8b-turbomind:
winogrande: 71.88 gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 9.38
llama-3_1-8b-hf: race-high_accuracy: 78.12
gsm8k: 78.12 winogrande_accuracy: 78.12
GPQA_diamond: 25 llama-3-8b-turbomind:
race-high: 90.62 gsm8k_accuracy: 50
winogrande: 62.5 GPQA_diamond_accuracy: 12.50
race-high_accuracy: 65.62
llama-3-8b-hf: winogrande_accuracy: 78.12
gsm8k: 46.88 mistral-7b-v0.2-hf:
GPQA_diamond: 6.25 gsm8k_accuracy: 31.25
race-high: 65.62 GPQA_diamond_accuracy: 6.25
winogrande: 65.62 race-high_accuracy: 62.5
winogrande_accuracy: 59.38
llama-3.1-8b-turbomind: mistral-7b-v0.3-hf:
gsm8k: 56.25 gsm8k_accuracy: 31.25
GPQA_diamond: 6.25 GPQA_diamond_accuracy: 6.25
race-high: 78.12 race-high_accuracy: 62.5
winogrande: 78.12 winogrande_accuracy: 59.38
mistral-7b-v0.2-vllm:
llama-3-8b-turbomind: gsm8k_accuracy: 34.38
gsm8k: 50 GPQA_diamond_accuracy: 6.25
GPQA_diamond: 9.38 race-high_accuracy: 62.5
race-high: 65.62 winogrande_accuracy: 65.62
winogrande: 78.12 qwen2.5-7b-hf:
gsm8k_accuracy: 81.25
mistral-7b-v0.2-hf: GPQA_diamond_accuracy: 18.75
gsm8k: 31.25 race-high_accuracy: 87.5
GPQA_diamond: 6.25 winogrande_accuracy: 71.88
race-high: 62.5 qwen2.5-1.5b-turbomind:
winogrande: 59.38 gsm8k_accuracy: 62.50
GPQA_diamond_accuracy: 12.50
mistral-7b-v0.3-hf: race-high_accuracy: 78.12
gsm8k: 31.25 winogrande_accuracy: 68.75
GPQA_diamond: 6.25 qwen2.5-7b-turbomind:
race-high: 62.5 gsm8k_accuracy: 75.00
winogrande: 59.38 GPQA_diamond_accuracy: 25
race-high_accuracy: 87.5
mistral-7b-v0.2-vllm: winogrande_accuracy: 71.88
gsm8k: 34.38 qwen1.5-moe-a2.7b-hf:
GPQA_diamond: 6.25 gsm8k_accuracy: 62.5
race-high: 62.5 GPQA_diamond_accuracy: 18.75
winogrande: 65.62 race-high_accuracy: 84.38
winogrande_accuracy: 75
qwen2.5-7b-hf: qwen2-0.5b-hf:
gsm8k: 81.25 gsm8k_accuracy: 25
GPQA_diamond: 18.75 GPQA_diamond_accuracy: 0
race-high: 87.5 race-high_accuracy: 40.62
winogrande: 71.88 winogrande_accuracy: 62.5
qwen2-1.5b-hf:
qwen2.5-1.5b-turbomind: gsm8k_accuracy: 59.38
gsm8k: 71.88 GPQA_diamond_accuracy: 9.38
GPQA_diamond: 15.62 race-high_accuracy: 81.25
race-high: 78.12 winogrande_accuracy: 62.5
winogrande: 71.88 qwen2-7b-hf:
gsm8k_accuracy: 68.75
qwen2.5-7b-turbomind: GPQA_diamond_accuracy: 9.38
gsm8k: 71.88 race-high_accuracy: 87.5
GPQA_diamond: 25 winogrande_accuracy: 68.75
race-high: 87.5 qwen2-1.5b-turbomind:
winogrande: 71.88 gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 9.38
qwen1.5-moe-a2.7b-hf: race-high_accuracy: 81.25
gsm8k: 62.5 winogrande_accuracy: 75
GPQA_diamond: 18.75 qwen2-7b-turbomind:
race-high: 84.38 gsm8k_accuracy: 75.00
winogrande: 75 GPQA_diamond_accuracy: 12.5
race-high_accuracy: 87.5
qwen2-0.5b-hf: winogrande_accuracy: 71.88
gsm8k: 25 qwen1.5-0.5b-vllm:
GPQA_diamond: 0 gsm8k_accuracy: 9.38
race-high: 40.62 GPQA_diamond_accuracy: 0
winogrande: 62.5 race-high_accuracy: 56.25
winogrande_accuracy: 62.5
qwen2-1.5b-hf: yi-1.5-6b-hf:
gsm8k: 59.38 gsm8k_accuracy: 62.5
GPQA_diamond: 9.38 GPQA_diamond_accuracy: 3.12
race-high: 81.25 race-high_accuracy: 87.5
winogrande: 62.5 winogrande_accuracy: 62.5
yi-1.5-9b-hf:
qwen2-7b-hf: gsm8k_accuracy: 75
gsm8k: 68.75 GPQA_diamond_accuracy: 40.62
GPQA_diamond: 9.38 race-high_accuracy: 87.5
race-high: 87.5 winogrande_accuracy: 59.38
winogrande: 68.75 yi-1.5-9b-turbomind:
gsm8k_accuracy: 78.12
qwen2-1.5b-turbomind: GPQA_diamond_accuracy: 40.62
gsm8k: 62.50 race-high_accuracy: 87.5
GPQA_diamond: 6.25 winogrande_accuracy: 71.88
race-high: 81.25 deepseek-v2-lite-hf:
winogrande: 75 gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 28.12
qwen2-7b-turbomind: race-high_accuracy: 59.38
gsm8k: 68.75 winogrande_accuracy: 71.88
GPQA_diamond: 12.5 internlm2-20b-hf:
race-high: 87.5 gsm8k_accuracy: 56.25
winogrande: 71.88 GPQA_diamond_accuracy: 15.62
race-high_accuracy: 68.75
qwen1.5-0.5b-vllm: winogrande_accuracy: 75
gsm8k: 9.38 internlm2-base-20b-hf:
GPQA_diamond: 0 gsm8k_accuracy: 12.5
race-high: 56.25 GPQA_diamond_accuracy: 9.38
winogrande: 62.5 race-high_accuracy: 84.38
winogrande_accuracy: 65.62
yi-1.5-6b-hf: internlm2-20b-turbomind:
gsm8k: 62.5 gsm8k_accuracy: 71.88
GPQA_diamond: 3.12 GPQA_diamond_accuracy: 15.62
race-high: 87.5 race-high_accuracy: 68.75
winogrande: 62.5 winogrande_accuracy: 81.25
qwen2.5-14b-hf:
yi-1.5-9b-hf: gsm8k_accuracy: 75
gsm8k: 75 GPQA_diamond_accuracy: 37.5
GPQA_diamond: 40.62 race-high_accuracy: 93.75
race-high: 87.5 winogrande_accuracy: 84.38
winogrande: 59.38 qwen2.5-32b-hf:
gsm8k_accuracy: 87.5
deepseek-v2-lite-hf: GPQA_diamond_accuracy: 31.25
gsm8k: 28.12 race-high_accuracy: 93.75
GPQA_diamond: 21.88 winogrande_accuracy: 78.12
race-high: 59.38 qwen2.5-32b-turbomind:
winogrande: 75 gsm8k_accuracy: 84.38
GPQA_diamond_accuracy: 28.12
internlm2-20b-hf: race-high_accuracy: 93.75
gsm8k: 56.25 winogrande_accuracy: 81.25
GPQA_diamond: 15.62 deepseek-67b-base-hf:
race-high: 68.75 gsm8k_accuracy: 59.38
winogrande: 75 GPQA_diamond_accuracy: 31.25
race-high_accuracy: 81.25
internlm2-base-20b-hf: winogrande_accuracy: 90.62
gsm8k: 12.5 deepseek-67b-base-turbomind:
GPQA_diamond: 9.38 gsm8k_accuracy: 56.25
race-high: 84.38 GPQA_diamond_accuracy: 28.12
winogrande: 65.62 race-high_accuracy: 81.25
winogrande_accuracy: 84.38
internlm2-20b-turbomind: llama-3-70b-turbomind:
gsm8k: 68.75 gsm8k_accuracy: 59.38
GPQA_diamond: 15.62 GPQA_diamond_accuracy: 9.38
race-high: 68.75 race-high_accuracy: 93.75
winogrande: 81.25 winogrande_accuracy: 84.38
qwen2.5-72b-turbomind:
qwen2.5-14b-hf: gsm8k_accuracy: 84.38
gsm8k: 75 GPQA_diamond_accuracy: 34.38
GPQA_diamond: 37.5 race-high_accuracy: 93.75
race-high: 93.75 winogrande_accuracy: 87.5
winogrande: 84.38 deepseek-v2-turbomind:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 81.25
winogrande_accuracy: 68.75
llama-3-70b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
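
This file gets the same restructuring at the top level: the flat model list is regrouped under chat: and base: headings, and every score key picks up its metric suffix. A short sketch of flattening the groups back into a single model-keyed dict, assuming model names stay unique across the two groups:

def flatten_model_groups(baseline_scores):
    # Merge the chat/base groups into one {model: scores} dict (sketch).
    flat = {}
    for group in ('chat', 'base'):
        flat.update(baseline_scores.get(group, {}))
    return flat
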

View File

@ -28,39 +28,39 @@ on:
description: 'Set branch or tag or commit id. Default is "main"' description: 'Set branch or tag or commit id. Default is "main"'
type: string type: string
default: 'main' default: 'main'
regression_func: regression_func_volc:
required: true required: true
description: 'regression functions' description: 'regression functions'
type: string type: string
default: "['chat_models','base_models', 'chat_obj_fullbench', 'chat_sub_fullbench', 'base_fullbench','cmd', 'api']" default: "['chat_models','base_models', 'chat_obj_fullbench', 'base_fullbench']"
cuda_env: regression_func_local:
required: true required: true
description: "regression conda env, eg. ['dsw_cu11','dsw_cu12']" description: 'regression functions'
type: string type: string
default: "['dsw_cu12']" default: "['cmd', 'api', 'chat_sub_fullbench']"
fullbench_eval:
required: true
description: 'fullbench volc functions'
type: string
default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
schedule: schedule:
- cron: '15 16 * * *' - cron: '15 14 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env: env:
CONDA_ENV: opencompass_regression
PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache
HF_DATASETS_OFFLINE: 1 HF_DATASETS_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1 HF_EVALUATE_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1 TRANSFORMERS_OFFLINE: 1
VLLM_USE_MODELSCOPE: false VLLM_USE_MODELSCOPE: false
LMDEPLOY_USE_MODELSCOPE: false LMDEPLOY_USE_MODELSCOPE: false
HF_HUB_OFFLINE: 1 HF_HUB_OFFLINE: 1
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report
OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }} OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3
PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip
REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/regression
COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
CONDA_ENV: regression_test
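
The workflow's inputs are reorganized to match the new runner split: regression_func becomes regression_func_volc (jobs that run on the volc cluster) plus regression_func_local (cmd, api and the subjective fullbench, which stay on a local runner), and a new fullbench_eval input drives a separate fullbench matrix. Each input is a JSON-encoded list that the matrix expands via fromJSON with a default fallback; a hedged Python sketch of that expansion:

import json

def expand_matrix(user_input, default):
    # Mimics `fromJSON(github.event.inputs.x || 'default')` (sketch).
    return json.loads(user_input) if user_input else json.loads(default)

funcs = expand_matrix(
    None, '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')
# -> one matrix job per entry: chat_models, base_models, ...
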
jobs: jobs:
build-pypi: build-pypi:
@ -124,11 +124,7 @@ jobs:
prepare_env: prepare_env:
if: ${{!cancelled()}} if: ${{!cancelled()}}
needs: ['build-pypi', 'build-pypi-lmdeploy'] needs: ['build-pypi', 'build-pypi-lmdeploy']
strategy: runs-on: volc_cu12
fail-fast: false
matrix:
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
runs-on: ${{ matrix.cuda_env }}
environment: 'prod' environment: 'prod'
timeout-minutes: 240 #4hours timeout-minutes: 240 #4hours
steps: steps:
@ -144,71 +140,52 @@ jobs:
- name: Remove Conda Env - name: Remove Conda Env
if: always() if: always()
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda env remove -y --name ${{env.CONDA_ENV}}
conda info --envs conda info --envs
- name: Prepare - create conda env and install torch - cu11
if: ${{matrix.cuda_env == 'dsw_cu11'}}
uses: nick-fields/retry@v3
id: retry1
with:
max_attempts: 3
timeout_minutes: 40
command: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu11torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - create conda env and install torch - cu12 - name: Prepare - create conda env and install torch - cu12
if: ${{matrix.cuda_env == 'dsw_cu12'}}
uses: nick-fields/retry@v3 uses: nick-fields/retry@v3
id: retry2
with: with:
max_attempts: 3 max_attempts: 1
timeout_minutes: 40 timeout_minutes: 240
command: | command: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 conda create -y --name ${{env.CONDA_ENV}} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda activate ${{env.CONDA_ENV}}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}} pip install -r /fs-computility/llm/qa-llm-cicd/config/requirements.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
pip list
- name: Prepare - reinstall lmdeploy - cu12 - name: Prepare - reinstall lmdeploy - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}} if: ${{inputs.build_lmdeploy}}
uses: actions/download-artifact@v4 uses: actions/download-artifact@v4
with: with:
name: my-artifact-${{ github.run_id }}-py310 name: my-artifact-${{ github.run_id }}-py310
- name: Prepare - reinstall lmdeploy - cu12 - name: Prepare - reinstall lmdeploy - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}} if: ${{inputs.build_lmdeploy}}
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda activate ${{env.CONDA_ENV}}
pip install lmdeploy-*.whl --no-deps pip install lmdeploy-*.whl --no-deps
- name: conda env
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
pip list
daily_run_test: daily_run_test_volc:
if: ${{!cancelled()}} if: ${{!cancelled()}}
needs: prepare_env needs: prepare_env
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}} regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}} runs-on: volc_cu12_daily
runs-on: ${{ matrix.cuda_env }}
environment: 'prod' environment: 'prod'
timeout-minutes: 240 #4hours timeout-minutes: 240 #4hours
steps: steps:
@ -217,97 +194,147 @@ jobs:
with: with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}} ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Prepare - prepare data and hf model - name: conda env
run: | run: |
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p . ${{env.CONDA_PATH}}/bin/activate
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub conda activate ${{env.CONDA_ENV}}
conda info --envs
pip list
- name: modify config
if: matrix.regression_func != 'chat_sub_fullbench'
run: |
cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py .
cat /fs-computility/llm/qa-llm-cicd/config/test_config.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
- name: Run test
uses: nick-fields/retry@v3
with:
max_attempts: 1
timeout_minutes: 120
command: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily
python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py
daily_run_test_local:
if: ${{!cancelled()}}
needs: prepare_env
strategy:
fail-fast: false
matrix:
regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
runs-on: volc_cu12_local
environment: 'prod'
timeout-minutes: 240 #4hours
steps:
- name: Clone repository
uses: actions/checkout@v2
with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: conda env
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
pip list
- name: modify config
if: matrix.regression_func == 'chat_sub_fullbench'
run: |
cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py .
cat /fs-computility/llm/qa-llm-cicd/config/test_config_sub.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
- name: Run command testcase - name: Run command testcase
if: matrix.regression_func == 'cmd' if: matrix.regression_func == 'cmd'
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda activate ${{env.CONDA_ENV}}
conda info --envs conda info --envs
export from_tf=TRUE export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu python tools/list_configs.py internlm2_5 mmlu
opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
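
Each cmd case repeats the same three-step loop: run opencompass into a per-case work dir (the _${{ matrix.cuda_env }} suffixes are gone now that the matrix only varies the function), re-point the regression_result_daily symlink at the run's summary directory, then assert scores via the matching pytest marker. A condensed Python sketch of one iteration; paths and arguments mirror the step above, while the subprocess wrapper itself is illustrative:

import glob
import os
import subprocess

def run_case(case, oc_args, report_root):
    work_dir = os.path.join(report_root, case)
    subprocess.run(['opencompass', *oc_args, '--work-dir', work_dir,
                    '--reuse', '--max-num-workers', '2',
                    '--dump-eval-details'], check=True)
    # Re-point the symlink that oc_score_assert.py reads from.
    summary = glob.glob(os.path.join(work_dir, '*', 'summary'))[0]
    if os.path.lexists('regression_result_daily'):
        os.remove('regression_result_daily')
    os.symlink(summary, 'regression_result_daily')
    subprocess.run(['python', '-m', 'pytest', '-m', case, '-s', '-v',
                    '--color=yes', '.github/scripts/oc_score_assert.py'],
                   check=True)
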
- name: Run chat model test
if: matrix.regression_func == 'chat_models'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test
if: matrix.regression_func == 'base_models'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_base.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run chat model test - fullbench
if: matrix.regression_func == 'chat_obj_fullbench'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat_objective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat_obj_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run chat model test - fullbench
if: matrix.regression_func == 'chat_sub_fullbench'
env:
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache_subset
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat_subjective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat_sub_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test - fullbench
if: matrix.regression_func == 'base_fullbench'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_base_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m base_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api - name: Run model test - api
if: matrix.regression_func == 'api' if: matrix.regression_func == 'api'
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda activate ${{env.CONDA_ENV}}
conda info --envs conda info --envs
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV" echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 120s sleep 120s
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }}/*/summary regression_result_daily rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api kill - name: Run model test - api kill
if: always() && matrix.regression_func == 'api' if: always() && matrix.regression_func == 'api'
run: | run: |
kill -15 "$restful_pid" kill -15 "$restful_pid"
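
The api job starts an lmdeploy server in the background, waits a fixed 120 seconds, runs the api config against it, and finally kills the server by PID. The fixed sleep could be replaced by a readiness poll; a sketch, assuming the server exposes an OpenAI-style /v1/models route (lmdeploy's default port 23333 is also an assumption here):

import time
import urllib.request

def wait_for_server(base_url='http://127.0.0.1:23333', timeout=600):
    # Poll until the API server answers instead of sleeping blindly (sketch).
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(f'{base_url}/v1/models', timeout=5):
                return
        except OSError:
            time.sleep(5)
    raise TimeoutError(f'server at {base_url} not ready after {timeout}s')
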
- name: Run testcase
if: matrix.regression_func == 'chat_sub_fullbench'
env:
COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache_subset
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
export from_tf=TRUE
opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily
python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py
fullbench_run_test:
if: ${{!cancelled()}}
needs: prepare_env
strategy:
fail-fast: false
matrix:
function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
runs-on: volc_cu12
environment: 'prod'
        timeout-minutes: 360 # 6 hours
steps:
- name: Clone repository
uses: actions/checkout@v2
with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: conda env
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
pip list
- name: Run testcase
uses: nick-fields/retry@v3
with:
max_attempts: 1
timeout_minutes: 240
command: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
export from_tf=TRUE
opencompass /fs-computility/llm/qa-llm-cicd/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily
python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py
notify_to_feishu: notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
needs: [daily_run_test] needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
environment: 'prod' environment: 'prod'
timeout-minutes: 5 timeout-minutes: 5
runs-on: self-hosted runs-on: self-hosted
View File
@ -18,18 +18,23 @@ concurrency:
cancel-in-progress: true cancel-in-progress: true
env: env:
CONDA_ENV: opencompass_ CONDA_ENV: pr_test
USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HF_DATASETS_OFFLINE: 1 HF_DATASETS_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1 TRANSFORMERS_OFFLINE: 1
HF_HUB_OFFLINE: 1
VLLM_USE_MODELSCOPE: false VLLM_USE_MODELSCOPE: false
LMDEPLOY_USE_MODELSCOPE: false LMDEPLOY_USE_MODELSCOPE: false
HF_HUB_OFFLINE: 1
CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3
PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip
REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/prtest
COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
jobs: jobs:
pr_run_test: pr_run_test:
runs-on: self-hosted runs-on: volc_cu12_local
environment: 'prod' environment: 'prod'
timeout-minutes: 30 timeout-minutes: 30
steps: steps:
@ -37,54 +42,55 @@ jobs:
uses: actions/checkout@v2 uses: actions/checkout@v2
- name: Prepare - Install opencompass - name: Prepare - Install opencompass
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}${{ runner.name }} conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y python3 -m pip uninstall opencompass -y
python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs conda info --envs
- name: Prepare - prepare data and hf model - name: conda env
run: | run: |
cp -r ${{env.USERSPACE_PREFIX}}/data . . ${{env.CONDA_PATH}}/bin/activate
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p conda activate ${{env.CONDA_ENV}}
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub conda info --envs
pip list
lmdeploy check_env
- name: Run test - name: Run test
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}${{ runner.name }} conda activate ${{env.CONDA_ENV}}
conda info --envs conda info --envs
rm -rf regression_result rm -rf regression_result
opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir regression_result1 --debug opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1 --debug
opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir regression_result2 --debug --max-num-workers 2 opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2 --debug --max-num-workers 2
opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir regression_result3 --debug --max-num-workers 2 opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3 --debug --max-num-workers 2
- name: Get result - name: Get result
run: | run: |
score=$(sed -n '$p' regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}') score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}')
if (( ${score%.*} >= 88 && ${score%.*} <= 89 )); then if (( ${score%.*} >= 88 && ${score%.*} <= 89 )); then
echo "score is $score between 88 and 89" echo "score is $score between 88 and 89"
else else
echo "score is $score not between 88 and 89" echo "score is $score not between 88 and 89"
exit 1 exit 1
fi fi
score=$(sed -n '$p' regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}') score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}')
if (( ${score%.*} >= 87 && ${score%.*} <= 88 )); then if (( ${score%.*} >= 87 && ${score%.*} <= 88 )); then
echo "score is $score between 87 and 88" echo "score is $score between 87 and 88"
else else
echo "score is $score not between 87 and 88" echo "score is $score not between 87 and 88"
exit 1 exit 1
fi fi
score=$(sed -n '$p' regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}') score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}')
if (( ${score%.*} >= 87 && ${score%.*} <= 89 )); then if (( ${score%.*} >= 87 && ${score%.*} <= 91 )); then
echo "score is $score between 87 and 89" echo "score is $score between 87 and 91"
else else
echo "score is $score not between 87 and 89" echo "score is $score not between 87 and 91"
exit 1 exit 1
fi fi
rm -rf regression_result1 & rm -rf regression_result2 & rm -rf regression_result3
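The Get result step asserts that the final column of the summary CSV's last row falls inside a fixed score band. A minimal Python sketch of the same check, assuming (as the shell pipeline does) that the last row's final field is the overall score:

import csv
import glob
import sys

# Find the summary CSV and read its final row, like `sed -n '$p' .../summary/*.csv`.
path = glob.glob('regression_result1/*/summary/*.csv')[0]
with open(path, newline='') as f:
    last_row = list(csv.reader(f))[-1]

score = float(last_row[-1])  # `awk -F ',' '{print $NF}'`
if not 88 <= int(score) <= 89:  # the shell compares the integer part of the score
    sys.exit(f'score is {score} not between 88 and 89')
print(f'score is {score} between 88 and 89')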
- name: Uninstall opencompass - name: Uninstall opencompass
if: always() if: always()
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}${{ runner.name }} conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y python3 -m pip uninstall opencompass -y
conda info --envs conda info --envs
View File
@ -1,21 +1,26 @@
name: deploy name: deploy
on: push on:
push:
concurrency: workflow_dispatch:
group: ${{ github.workflow }}-${{ github.ref }} inputs:
cancel-in-progress: true confirm_publish:
description: 'Type YES to confirm publishing to PyPI'
required: true
type: string
jobs: jobs:
build-n-publish: build-n-publish:
runs-on: ubuntu-latest runs-on: ubuntu-latest
if: startsWith(github.event.ref, 'refs/tags') if: |
github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') ||
(github.event_name == 'workflow_dispatch' && inputs.confirm_publish == 'YES')
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
- name: Set up Python 3.7 - name: Set up Python 3.10
uses: actions/setup-python@v1 uses: actions/setup-python@v4
with: with:
python-version: 3.7 python-version: '3.10'
- name: Build lagent - name: Build lagent
run: | run: |
pip install wheel pip install wheel
View File
@ -79,6 +79,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`. We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
<p align="right"><a href="#top">🔝Back to top</a></p> <p align="right"><a href="#top">🔝Back to top</a></p>
## 🛠️ Installation ## 🛠️ Installation
View File
@ -77,6 +77,8 @@
We will progressively release performance leaderboards for open-source and API models; see the [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home). To join the evaluation, please provide the model repository URL or a standard API interface to `opencompass@pjlab.org.cn`.
You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
<p align="right"><a href="#top">🔝返回顶部</a></p> <p align="right"><a href="#top">🔝返回顶部</a></p>
## 🛠️ 安装指南 ## 🛠️ 安装指南
View File
@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 128] max_seq_lens = [1024 * 128]
abbr_suffixs = ['128k'] abbr_suffixs = ['128k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
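After this patch, every RULER length config follows the same shape: read an optional TOKENIZER_MODEL override from the environment, then stamp the suffix, sample count, sequence length, and tokenizer onto each imported dataset. A consolidated sketch of the resulting loop for the 128k case; the dataset dicts normally arrive via read_base(), so the literal entries and the plain dict copy are stand-ins (the diff elides the copy step):

import os

# Stand-ins for the dataset dicts pulled in via read_base() in the real config.
import_ds = [{'abbr': 'ruler_niah'}, {'abbr': 'ruler_qa'}]  # hypothetical entries

NUM_SAMPLES = 100
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')  # override added by this patch
max_seq_lens = [1024 * 128]
abbr_suffixs = ['128k']

ruler_datasets = []
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
    for dataset in import_ds:
        tmp_dataset = dict(dataset)  # copy so each length gets its own config
        tmp_dataset['abbr'] += '_' + abbr_suffix
        tmp_dataset['num_samples'] = NUM_SAMPLES
        tmp_dataset['max_seq_length'] = max_seq_len
        tmp_dataset['tokenizer_model'] = tokenizer_model  # new field wired to the env var
        ruler_datasets.append(tmp_dataset)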
View File
@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 16] max_seq_lens = [1024 * 16]
abbr_suffixs = ['16k'] abbr_suffixs = ['16k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
View File

@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 1024] max_seq_lens = [1024 * 1024]
abbr_suffixs = ['1m'] abbr_suffixs = ['1m']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 32] max_seq_lens = [1024 * 32]
abbr_suffixs = ['32k'] abbr_suffixs = ['32k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
View File

@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 4] max_seq_lens = [1024 * 4]
abbr_suffixs = ['4k'] abbr_suffixs = ['4k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
View File

@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 64] max_seq_lens = [1024 * 64]
abbr_suffixs: list[str] = ['64k'] abbr_suffixs: list[str] = ['64k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 8] max_seq_lens = [1024 * 8]
abbr_suffixs = ['8k'] abbr_suffixs = ['8k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,9 +1,7 @@
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset, RulerNiahEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset
from opencompass.datasets.ruler.ruler_niah import RulerNiahEvaluator
# Ruler Dataset settings # Ruler Dataset settings
niah_configurations = [ niah_configurations = [
@ -92,10 +90,7 @@ for index, config in enumerate(niah_configurations):
'type': RulerNiahDataset, 'type': RulerNiahDataset,
'base_path': base_path, 'base_path': base_path,
'file_path': file_path, 'file_path': file_path,
# 'tokenizer_model': model_path,
'tokens_to_generate': 128, 'tokens_to_generate': 128,
# 'max_seq_length': max_seq_len,
# 'num_samples': NUM_SAMPLES,
'type_haystack': config['type_haystack'], 'type_haystack': config['type_haystack'],
'type_needle_k': config['type_needle_k'], 'type_needle_k': config['type_needle_k'],
'type_needle_v': config['type_needle_v'], 'type_needle_v': config['type_needle_v'],
View File
@ -10,12 +10,10 @@ from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
####################################################################### #######################################################################
with read_base(): with read_base():
# Datasets Part # Datasets Part
## Core Set
# Knowledge # Knowledge
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import ( from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
mmlu_pro_datasets, mmlu_pro_datasets,
) )
# General Reasoning # General Reasoning
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import ( from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import (
gpqa_datasets, gpqa_datasets,
@ -23,22 +21,19 @@ with read_base():
from opencompass.configs.datasets.bbh.bbh_0shot_nocot_gen_925fc4 import ( from opencompass.configs.datasets.bbh.bbh_0shot_nocot_gen_925fc4 import (
bbh_datasets, bbh_datasets,
) )
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import ( from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
humaneval_datasets, humaneval_datasets,
) )
# Instruction Following # Instruction Following
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ( from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import (
ifeval_datasets, ifeval_datasets,
) )
from opencompass.configs.datasets.livecodebench.livecodebench_gen_6966bc import ( from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
LCBCodeGeneration_dataset, LCBCodeGeneration_dataset,
) )
# Math # Math
from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import (
cmo_fib_datasets,
)
from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import ( from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import (
aime2024_datasets, aime2024_datasets,
) )
@ -77,7 +72,6 @@ core_summary_groups = [
['IFEval', 'Prompt-level-strict-accuracy'], ['IFEval', 'Prompt-level-strict-accuracy'],
['bbh', 'naive_average'], ['bbh', 'naive_average'],
['math_prm800k_500', 'accuracy'], ['math_prm800k_500', 'accuracy'],
['cmo_fib', 'accuracy'],
['aime2024', 'accuracy'], ['aime2024', 'accuracy'],
['GPQA_diamond', 'accuracy'], ['GPQA_diamond', 'accuracy'],
['mmlu_pro', 'naive_average'], ['mmlu_pro', 'naive_average'],
@ -101,7 +95,6 @@ summarizer = dict(
'', '',
'Math Calculation', 'Math Calculation',
['math_prm800k_500', 'accuracy'], ['math_prm800k_500', 'accuracy'],
['cmo_fib', 'accuracy'],
['aime2024', 'accuracy'], ['aime2024', 'accuracy'],
'', '',
'Knowledge', 'Knowledge',
View File
@ -0,0 +1,164 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
LCBCodeGenerationDataset,
LCBCodeExecutionDataset,
LCBTestOutputPredictionDataset,
LCBCodeGenerationEvaluator,
LCBCodeExecutionEvaluator,
LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants
lcb_code_generation_reader_cfg = dict(
input_columns=[
'question_content',
'format_prompt',
],
# output_column='evaluation_sample',
output_column='question_id',
)
SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
'### Answer: (use the provided format with backticks)\n\n'
# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=prompt_template
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
lcb_code_generation_eval_cfg = dict(
evaluator=dict(
type=LCBCodeGenerationEvaluator,
num_process_evaluate=4,
timeout=6,
),
pred_role='BOT',
)
LCBCodeGeneration_dataset = dict(
type=LCBCodeGenerationDataset,
abbr='lcb_code_generation',
path='opencompass/code_generation_lite',
reader_cfg=lcb_code_generation_reader_cfg,
infer_cfg=lcb_code_generation_infer_cfg,
eval_cfg=lcb_code_generation_eval_cfg
)
# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)
lcb_code_execution_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
),
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
lcb_code_execution_eval_cfg = dict(
evaluator=dict(
type=LCBCodeExecutionEvaluator,
),
pred_role='BOT',
)
LCBCodeExecution_dataset = dict(
type=LCBCodeExecutionDataset,
abbr='lcb_code_execution',
path='opencompass/execution-v2',
reader_cfg=lcb_code_execution_reader_cfg,
infer_cfg=lcb_code_execution_infer_cfg,
eval_cfg=lcb_code_execution_eval_cfg,
)
# Test Output Prediction Dataset
lcb_test_output_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
lcb_test_output_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
# begin=[
# dict(
# role='SYSTEM',
# prompt=system_prompt
# ),
# ],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
lcb_test_output_eval_cfg = dict(
evaluator=dict(
type=LCBTestOutputEvaluator,
),
pred_role='BOT',
)
LCBTestOutput_dataset = dict(
type=LCBTestOutputPredictionDataset,
abbr='lcb_test_output',
path='opencompass/test_generation',
reader_cfg=lcb_test_output_reader_cfg,
infer_cfg=lcb_test_output_infer_cfg,
eval_cfg=lcb_test_output_eval_cfg,
)
LCB_datasets = [
LCBCodeGeneration_dataset,
LCBCodeExecution_dataset,
LCBTestOutput_dataset,
]
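The three dataset definitions are aggregated into LCB_datasets; the fullbench config earlier in this patch consumes only the generation split by importing LCBCodeGeneration_dataset from this livecodebench_gen_a4f90b module. A minimal sketch of such a top-level config (the model and summarizer settings it would normally carry are omitted):

from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
        LCBCodeGeneration_dataset,  # defined in the file above
    )

datasets = [LCBCodeGeneration_dataset]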
View File
@ -23,7 +23,7 @@ math_infer_cfg = dict(
), ),
), ),
retriever=dict(type=ZeroRetriever), retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024), inferencer=dict(type=GenInferencer),
) )
# postprocess v2 # postprocess v2
View File
@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 128] max_seq_lens = [1024 * 128]
abbr_suffixs = ['128k'] abbr_suffixs = ['128k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 16] max_seq_lens = [1024 * 16]
abbr_suffixs = ['16k'] abbr_suffixs = ['16k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 1024] max_seq_lens = [1024 * 1024]
abbr_suffixs = ['1m'] abbr_suffixs = ['1m']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 32] max_seq_lens = [1024 * 32]
abbr_suffixs = ['32k'] abbr_suffixs = ['32k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 4] max_seq_lens = [1024 * 4]
abbr_suffixs = ['4k'] abbr_suffixs = ['4k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 64] max_seq_lens = [1024 * 64]
abbr_suffixs: list[str] = ['64k'] abbr_suffixs: list[str] = ['64k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 8] max_seq_lens = [1024 * 8]
abbr_suffixs = ['8k'] abbr_suffixs = ['8k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,9 +1,7 @@
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset, RulerNiahEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset
from opencompass.datasets.ruler.ruler_niah import RulerNiahEvaluator
# Ruler Dataset settings # Ruler Dataset settings
niah_configurations = [ niah_configurations = [
@ -92,10 +90,7 @@ for index, config in enumerate(niah_configurations):
'type': RulerNiahDataset, 'type': RulerNiahDataset,
'base_path': base_path, 'base_path': base_path,
'file_path': file_path, 'file_path': file_path,
# 'tokenizer_model': model_path,
'tokens_to_generate': 128, 'tokens_to_generate': 128,
# 'max_seq_length': max_seq_len,
# 'num_samples': NUM_SAMPLES,
'type_haystack': config['type_haystack'], 'type_haystack': config['type_haystack'],
'type_needle_k': config['type_needle_k'], 'type_needle_k': config['type_needle_k'],
'type_needle_v': config['type_needle_v'], 'type_needle_v': config['type_needle_v'],
View File
@ -163,6 +163,8 @@ class BigCodeBenchEvaluator(BaseEvaluator):
logger.info('Read timeout error. Retrying in 4s...') logger.info('Read timeout error. Retrying in 4s...')
time.sleep(4) time.sleep(4)
if 'pass@1' in pass_at_k.keys():
pass_at_k['pass@1'] *= 100
dump_results = {'details': results} dump_results = {'details': results}
dump_results.update(pass_at_k) dump_results.update(pass_at_k)
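The added lines rescale pass@1 from a 0-1 fraction to a percentage before it is merged into the dumped results, so it reads on the same 0-100 scale as the other metrics. A minimal sketch of that post-processing, with illustrative values:

# Hypothetical evaluator output: pass@k rates as fractions.
pass_at_k = {'pass@1': 0.375, 'pass@5': 0.62}
results = [{'task_id': 0, 'passed': True}]  # illustrative detail records

if 'pass@1' in pass_at_k:
    pass_at_k['pass@1'] *= 100  # report pass@1 on a 0-100 scale

dump_results = {'details': results}
dump_results.update(pass_at_k)
assert dump_results['pass@1'] == 37.5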
View File
@ -147,7 +147,7 @@ class OpenAI(BaseAPIModel):
self.path = path self.path = path
self.max_completion_tokens = max_completion_tokens self.max_completion_tokens = max_completion_tokens
self.logger.warning( self.logger.warning(
f'Max Completion tokens for {path} is :{max_completion_tokens}') f'Max Completion tokens for {path} is {max_completion_tokens}')
def generate(self, def generate(self,
inputs: List[PromptType], inputs: List[PromptType],
@ -278,7 +278,7 @@ class OpenAI(BaseAPIModel):
self.logger.warning( self.logger.warning(
f"'max_token' is unsupported for model {self.path}") f"'max_token' is unsupported for model {self.path}")
self.logger.warning( self.logger.warning(
f'We use max_completion_tokens:' f'We use max_completion_tokens: '
f'{self.max_completion_tokens}for this query') f'{self.max_completion_tokens}for this query')
data = dict( data = dict(
model=self.path, model=self.path,
@ -588,13 +588,12 @@ class OpenAISDK(OpenAI):
self.logger.warning( self.logger.warning(
f"'max_token' is unsupported for model {self.path}") f"'max_token' is unsupported for model {self.path}")
self.logger.warning( self.logger.warning(
f'We use max_completion_tokens:' f'We use max_completion_tokens: '
f'{self.max_completion_tokens}for this query') f'{self.max_completion_tokens}for this query')
query_data = dict( query_data = dict(
model=self.path, model=self.path,
max_completion_tokens=self.max_completion_tokens, max_completion_tokens=self.max_completion_tokens,
n=1, n=1,
temperature=self.temperature,
messages=messages, messages=messages,
extra_body=self.extra_body, extra_body=self.extra_body,
) )
@ -636,8 +635,8 @@ class OpenAISDK(OpenAI):
if (status_code is not None if (status_code is not None
and status_code in self.status_code_mappings): and status_code in self.status_code_mappings):
error_message = self.status_code_mappings[status_code] error_message = self.status_code_mappings[status_code]
self.logger.info(f'Status Code: {status_code},\n' self.logger.info(f'Status Code: {status_code}, \n'
f'Original Error Message: {e},\n' f'Original Error Message: {e}, \n'
f'Return Message: {error_message} ') f'Return Message: {error_message} ')
return error_message return error_message
else: else:
View File
@ -335,7 +335,7 @@ class DLCRunner(BaseRunner):
pass pass
# Lark Report when failed # Lark Report when failed
if return_code == -1: if return_code == -1 and self.lark_reporter is not None:
content = f'DLC job failed. Task name: {task_name}' content = f'DLC job failed. Task name: {task_name}'
self.lark_reporter.post(title='DLC job failed', content=content) self.lark_reporter.post(title='DLC job failed', content=content)
View File
@ -207,9 +207,14 @@ class LocalRunner(BaseRunner):
task_name = task.name task_name = task.name
pwd = os.getcwd()
# Dump task config to file # Dump task config to file
mmengine.mkdir_or_exist('tmp/') mmengine.mkdir_or_exist('tmp/')
param_file = f'tmp/{os.getpid()}_{index}_params.py' # Using uuid to avoid filename conflict
import uuid
uuid_str = str(uuid.uuid4())
param_file = f'{pwd}/tmp/{uuid_str}_params.py'
try: try:
task.cfg.dump(param_file) task.cfg.dump(param_file)
tmpl = get_command_template(gpu_ids) tmpl = get_command_template(gpu_ids)
@ -236,5 +241,8 @@ class LocalRunner(BaseRunner):
logger.error(f'task {task_name} fail, see\n{out_path}') logger.error(f'task {task_name} fail, see\n{out_path}')
finally: finally:
# Clean up # Clean up
os.remove(param_file) if not self.keep_tmp_file:
os.remove(param_file)
else:
pass
return task_name, result.returncode return task_name, result.returncode
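The LocalRunner change replaces the PID-and-index file name with a UUID so concurrent runners cannot collide on the same temp file, and leaves the file in place when keep_tmp_file is set. A condensed sketch of the new lifecycle; the config dump is faked with a plain write, and keep_tmp_file mirrors the runner flag:

import os
import uuid

keep_tmp_file = False  # mirrors the runner option; kept files aid debugging

os.makedirs('tmp', exist_ok=True)
param_file = f'{os.getcwd()}/tmp/{uuid.uuid4()}_params.py'  # UUID avoids conflicts
try:
    with open(param_file, 'w') as f:  # stands in for task.cfg.dump(param_file)
        f.write('# dumped task config\n')
    # ... build the command from param_file and launch the task here ...
finally:
    if not keep_tmp_file:
        os.remove(param_file)  # clean up unless asked to keep it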
View File
@ -227,20 +227,20 @@ class VOLCRunner(BaseRunner):
task_status = os.popen(ask_cmd).read() task_status = os.popen(ask_cmd).read()
pattern = r'(?<=\[{"Status":").*(?="}\])' pattern = r'(?<=\[{"Status":").*(?="}\])'
match = re.search(pattern, task_status) match = re.search(pattern, task_status)
if match:
task_status = match.group()
else:
task_status = 'Exception'
if self.debug: if self.debug:
print(task_status) print(task_status)
logs = os.popen(log_cmd).read() logs = os.popen(log_cmd).read()
with open(log_path, 'w', encoding='utf-8') as f: with open(log_path, 'w', encoding='utf-8') as f:
f.write(logs) f.write(logs)
if task_status in [ if match:
'Success', 'Failed', 'Cancelled', 'Exception', task_status = match.group()
'Killing', 'SuccessHolding', 'FailedHolding' if task_status in [
]: 'Success', 'Failed', 'Cancelled', 'Exception',
break 'Killing', 'SuccessHolding', 'FailedHolding',
'Killed'
]:
break
# If pattern not found or command failed, sleep and retry
time.sleep(poll_interval) time.sleep(poll_interval)
else: else:
task_status = 'Exception' task_status = 'Exception'
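The reworked loop only inspects the status once the regex has actually matched, and retries after poll_interval otherwise instead of treating a failed match as terminal; 'Killed' also joins the terminal states. A simplified sketch of the control flow, with the volc CLI query stubbed out:

import re
import time

TERMINAL = {'Success', 'Failed', 'Cancelled', 'Exception',
            'Killing', 'SuccessHolding', 'FailedHolding', 'Killed'}

def query_status():
    # stands in for `os.popen(ask_cmd).read()` against the volc CLI
    return '[{"Status":"Success"}]'

poll_interval, max_polls = 1, 5
for _ in range(max_polls):
    match = re.search(r'(?<=\[{"Status":").*(?="}\])', query_status())
    if match:
        task_status = match.group()
        if task_status in TERMINAL:
            break
    # pattern not found or job still running: sleep and retry
    time.sleep(poll_interval)
else:
    task_status = 'Exception'  # polling budget exhausted without a terminal state

print(task_status)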
View File
@ -4,6 +4,7 @@ import functools
import getpass import getpass
import math import math
import os.path as osp import os.path as osp
from collections import OrderedDict
from datetime import datetime from datetime import datetime
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
@ -110,6 +111,7 @@ class DefaultSubjectiveSummarizer:
if not osp.exists(filepath): if not osp.exists(filepath):
continue continue
result = mmengine.load(filepath) result = mmengine.load(filepath)
result = OrderedDict(sorted(result.items()))
result.pop('details', None) result.pop('details', None)
if idx == 0: if idx == 0:
raw_results[model_abbr][dataset_abbr] = result raw_results[model_abbr][dataset_abbr] = result
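Sorting the loaded keys makes the summary's column order deterministic across runs regardless of how the metrics were serialized. The one-line normalization in context, with a toy dict standing in for the mmengine.load output:

from collections import OrderedDict

# Toy stand-in for `mmengine.load(filepath)` output.
result = {'naive_average': 55.2, 'details': [{'id': 0}], 'accuracy': 61.0}

result = OrderedDict(sorted(result.items()))  # deterministic key order
result.pop('details', None)                   # drop per-sample details, as above
print(list(result))  # ['accuracy', 'naive_average']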
View File
@ -2,7 +2,7 @@ absl-py
accelerate>=0.19.0 accelerate>=0.19.0
cpm_kernels cpm_kernels
datasets>=2.12.0 datasets>=2.12.0
einops==0.5.0 einops>=0.5.0
evaluate>=0.3.0 evaluate>=0.3.0
func_timeout func_timeout
fuzzywuzzy fuzzywuzzy
@ -16,7 +16,7 @@ jieba
json5 json5
jsonlines jsonlines
mmengine-lite mmengine-lite
nltk==3.8 nltk>=3.7
numpy>=1.23.4,<2.0.0 numpy>=1.23.4,<2.0.0
openai openai
OpenCC OpenCC