[ci] react daily test (#1668)

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* refactor summarize

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* Update daily-run-test.yml

* Update daily-run-test.yml

* update

* update

* update

* update

* update

* Update daily-run-test.yml

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* Update daily-run-test.yml

* Update daily-run-test.yml

* update

* update

* Update daily-run-test.yml

* update

* update

* update

---------

Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
This commit is contained in:
zhulinJulia24 2024-11-12 18:40:27 +08:00 committed by GitHub
parent 3ec178f4a9
commit a9d6b6461f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 1710 additions and 521 deletions

39
.github/scripts/eval_regression_api.py vendored Normal file
View File

@ -0,0 +1,39 @@
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAISDK
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_gen import \
race_datasets # noqa: F401, E501
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
models = [
dict(
abbr='lmdeploy-api-test',
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://localhost:23333/v1',
path='internlm2',
tokenizer_path='internlm/internlm2_5-7b-chat',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=128,
max_out_len=1024,
max_seq_len=4096,
temperature=0.01,
batch_size=128,
retry=20,
)
]

View File

@ -2,15 +2,21 @@ from mmengine.config import read_base
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_ppl import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.chatglm.hf_glm4_9b import \
models as hf_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
models as hf_deepseek_v2_lite_model # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
@ -19,34 +25,58 @@ with read_base():
models as hf_gemma2_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_9b import \
models as hf_gemma2_9b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_2b import \
models as hf_gemma_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_7b import \
models as hf_gemma_7b_model # noqa: F401, E501
from opencompass.configs.models.gemma.vllm_gemma_2b import \
models as vllm_gemma_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.vllm_gemma_7b import \
models as vllm_gemma_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
models as hf_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
models as hf_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
models as hf_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_7b import \
models as lmdeploy_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_20b import \
models as lmdeploy_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
models as hf_llama2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
models as hf_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
models as hf_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
models as lmdeploy_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
models as lmdeploy_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
models as hf_mistral_7b_v0_3_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \
models as vllm_mixtral_8x7b_v0_1_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_7b import \
models as hf_qwen_2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \
models as hf_qwen_2_5_14b_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
models as lmdeploy_qwen2_5_1_5b_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \
models as lmdeploy_qwen2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
@ -65,11 +95,27 @@ with read_base():
models as hf_yi_1_5_6b_model # noqa: F401, E501
from opencompass.configs.models.yi.hf_yi_1_5_9b import \
models as hf_yi_1_5_9b_model # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501
race_datasets = [race_datasets[1]]
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
for d in datasets:
d['reader_cfg']['test_range'] = '[0:100]'
d['reader_cfg']['test_range'] = '[0:32]'
for m in models:
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
summarizer = dict(
dataset_abbrs=[
['gsm8k', 'accuracy'],
['GPQA_diamond', 'accuracy'],
['race-high', 'accuracy'],
['winogrande', 'accuracy'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

View File

@ -0,0 +1,184 @@
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.ARC_c.ARC_c_few_shot_ppl import \
ARC_c_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_gen_a2697c import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
GaokaoBench_datasets # noqa: F401, E501
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import \
gpqa_datasets # noqa: F401, E501
# Corebench v1.7
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \
humaneval_datasets as humaneval_v2_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \
humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
mathbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
sanitized_mbpp_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \
mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import \
nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_few_shot_ppl import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \
BoolQ_datasets # noqa: F401, E501
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
TheoremQA_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \
triviaqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \
wikibench_datasets # noqa: F401, E501
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.summarizers.groups.bbh import \
bbh_summary_groups # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.GaokaoBench import \
GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
race_datasets = [race_datasets[1]] # Only take RACE-High
humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2'
bbh_datasets = [
x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr']
or 'multistep_arithmetic_two' in x['abbr']
]
cmmlu_datasets = [
x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [
'ancient_chinese', 'chinese_civil_service_exam',
'chinese_driving_rule', 'chinese_food_culture',
'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
'chinese_teacher_qualification', 'construction_project_management',
'elementary_chinese', 'elementary_commonsense', 'ethnology',
'high_school_politics', 'modern_chinese',
'traditional_chinese_medicine'
]
]
mmlu_datasets = [
x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
'business_ethics', 'clinical_knowledge', 'college_medicine',
'global_facts', 'human_aging', 'management', 'marketing',
'medical_genetics', 'miscellaneous', 'nutrition',
'professional_accounting', 'professional_medicine', 'virology'
]
]
mmlu_pro_datasets = [mmlu_pro_datasets[0]]
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
GaokaoBench_datasets = [
x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr']
]
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
summarizer = dict(
dataset_abbrs=[
['race-high', 'accuracy'],
['ARC-c', 'accuracy'],
['BoolQ', 'accuracy'],
['mmlu_pro', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['cmmlu', 'naive_average'],
['mmlu', 'naive_average'],
['drop', 'accuracy'],
['bbh', 'naive_average'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['openai_humaneval_v2', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['wikibench-wiki-single_choice_cncircular', 'perf_4'],
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
'###### Overall: Average between MathBench-A and MathBench-T ######',
'Overall',
'',
'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two',
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
['mmlu-other', 'accuracy'],
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
['cmmlu-china-specific', 'accuracy'],
'mmlu_pro',
'mmlu_pro_biology',
'mmlu_pro_business',
'mmlu_pro_chemistry',
'mmlu_pro_computer_science',
'mmlu_pro_economics',
'mmlu_pro_engineering',
'mmlu_pro_health',
'mmlu_pro_history',
'mmlu_pro_law',
'mmlu_pro_math',
'mmlu_pro_philosophy',
'mmlu_pro_physics',
'mmlu_pro_psychology',
'mmlu_pro_other',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
for d in datasets:
d['reader_cfg']['test_range'] = '[0:16]'
for m in models:
m['abbr'] = m['abbr'] + '_fullbench'
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])

View File

@ -1,7 +1,5 @@
from mmengine.config import read_base
from opencompass.models import OpenAISDK
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
@ -29,6 +27,12 @@ with read_base():
models as hf_gemma2_2b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
models as hf_gemma2_9b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_2b_it import \
models as hf_gemma_2b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_7b_it import \
models as hf_gemma_7b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
models as lmdeploy_gemma_9b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
models as vllm_gemma_7b_it_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
@ -51,18 +55,35 @@ with read_base():
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
models as hf_llama3_1_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_2_3b_instruct import \
models as hf_llama3_2_3b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as hf_llama3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
models as lmdeploy_llama3_2_3b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \
models as hf_mistral_7b_instruct_v0_3_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_nemo_instruct_2407 import \
models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
models as hf_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \
models as lmdeploy_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
models as \
lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \
models as vllm_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm3_4b import \
models as hf_minicpm3_4b_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
@ -73,6 +94,10 @@ with read_base():
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \
models as hf_qwen2_5_14b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
models as lmdeploy_qwen2_5_14b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
@ -89,10 +114,8 @@ with read_base():
models as hf_yi_1_5_6b_chat_model # noqa: F401, E501
from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
models as hf_yi_1_5_9b_chat_model # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
race_datasets = [race_datasets[1]]
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
api_meta_template = dict(
@ -103,25 +126,24 @@ api_meta_template = dict(
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
model_name = ''
models.append(
dict(
abbr='lmdeploy-api-test',
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://judgemodel:10001/v1',
path='compass_judger_internlm2_102b_0508',
tokenizer_path='internlm/internlm2_5-20b-chat',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=50,
max_out_len=1024,
max_seq_len=4096,
temperature=0.01,
batch_size=128,
retry=3,
))
for d in datasets:
d['reader_cfg']['test_range'] = '[0:100]'
d['reader_cfg']['test_range'] = '[0:32]'
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
summarizer = dict(
dataset_abbrs=[
'gsm8k',
'race-middle',
'race-high',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

View File

@ -0,0 +1,246 @@
from mmengine.config import read_base
with read_base():
# read hf models - chat models
# Dataset
from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
ARC_c_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \
ds1000_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
GaokaoBench_datasets # noqa: F401, E501
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets # noqa: F401, E501
# new datasets in Fullbench v1.1
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \
humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import \
humanevalx_datasets # noqa: F401, E501
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
ifeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
mathbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
sanitized_mbpp_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_cot_gen_d95929 import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.scicode.scicode_gen_085b98 import \
SciCode_datasets # noqa: F401, E501
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \
BoolQ_datasets # noqa: F401, E501
from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
teval_datasets as teval_en_datasets # noqa: F401, E501
from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
teval_datasets as teval_zh_datasets # noqa: F401, E501
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
TheoremQA_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \
triviaqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import \
wikibench_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.bbh import \
bbh_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.ds1000 import \
ds1000_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.GaokaoBench import \
GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.humanevalx import \
humanevalx_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.scicode import \
scicode_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.teval import \
teval_summary_groups # noqa: F401, E501
# For HumanEval-X Evaluation
# Apply the evaluator ip_address and port
race_datasets = [race_datasets[1]]
for item in humanevalx_datasets:
item['eval_cfg']['evaluator'][
'ip_address'] = 'codeeval.opencompass.org.cn/humanevalx'
item['eval_cfg']['evaluator']['port'] = ''
# For DS-1000 Evaluation
# Apply the evaluator ip_address and port
for item in ds1000_datasets:
item['eval_cfg']['evaluator'][
'ip_address'] = 'codeeval.opencompass.org.cn/ds1000'
item['eval_cfg']['evaluator']['port'] = ''
bbh_datasets = [
x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr']
or 'multistep_arithmetic_two' in x['abbr']
]
cmmlu_datasets = [
x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [
'ancient_chinese', 'chinese_civil_service_exam',
'chinese_driving_rule', 'chinese_food_culture',
'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
'chinese_teacher_qualification', 'construction_project_management',
'elementary_chinese', 'elementary_commonsense', 'ethnology',
'high_school_politics', 'modern_chinese',
'traditional_chinese_medicine'
]
]
mmlu_datasets = [
x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
'business_ethics', 'clinical_knowledge', 'college_medicine',
'global_facts', 'human_aging', 'management', 'marketing',
'medical_genetics', 'miscellaneous', 'nutrition',
'professional_accounting', 'professional_medicine', 'virology'
]
]
mmlu_pro_datasets = [mmlu_pro_datasets[0]]
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
GaokaoBench_datasets = [
x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr']
]
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')
and 'scicode' not in k.lower() and 'teval' not in k),
[],
)
datasets += teval_en_datasets
datasets += teval_zh_datasets
# datasets += SciCode_datasets
summarizer = dict(
dataset_abbrs=[
['race-high', 'accuracy'],
['ARC-c', 'accuracy'],
['BoolQ', 'accuracy'],
['mmlu_pro', 'naive_average'],
['drop', 'accuracy'],
['bbh', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['math', 'accuracy'],
['wikibench-wiki-single_choice_cncircular', 'perf_4'],
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['cmmlu', 'naive_average'],
['mmlu', 'naive_average'],
['teval', 'naive_average'],
['SciCode', 'accuracy'],
['SciCode', 'sub_accuracy'],
['humanevalx', 'naive_average'],
['ds1000', 'naive_average'],
['IFEval', 'Prompt-level-strict-accuracy'],
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
'###### Overall: Average between MathBench-A and MathBench-T ######',
'Overall',
'',
'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two',
''
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
'mmlu-other',
'',
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
'cmmlu-china-specific',
'',
'mmlu_pro',
'mmlu_pro_biology',
'mmlu_pro_business',
'mmlu_pro_chemistry',
'mmlu_pro_computer_science',
'mmlu_pro_economics',
'mmlu_pro_engineering',
'mmlu_pro_health',
'mmlu_pro_history',
'mmlu_pro_law',
'mmlu_pro_math',
'mmlu_pro_philosophy',
'mmlu_pro_physics',
'mmlu_pro_psychology',
'mmlu_pro_other',
'',
'GaokaoBench_2010-2022_Math_II_MCQs',
'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
'',
'humanevalx-python',
'humanevalx-cpp',
'humanevalx-go',
'humanevalx-java',
'humanevalx-js',
'',
'ds1000_Pandas',
'ds1000_Numpy',
'ds1000_Tensorflow',
'ds1000_Scipy',
'ds1000_Sklearn',
'ds1000_Pytorch',
'ds1000_Matplotlib',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
for d in datasets:
d['reader_cfg']['test_range'] = '[0:16]'
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
m['abbr'] = m['abbr'] + '_fullbench'
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])

View File

@ -0,0 +1,70 @@
from copy import deepcopy
from mmengine.config import read_base
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import SubjectiveSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
with read_base():
# read hf models - chat models
# Dataset
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
alignbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \
alpacav2_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
arenahard_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \
compassarena_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \
fofo_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \
followbench_llmeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \
mtbench101_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \
wildbench_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
and 'mtbench101' not in k and 'wildbench' not in k), [])
datasets += mtbench101_datasets # noqa: F401, E501
datasets += wildbench_datasets # noqa: F401, E501
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
m['abbr'] = m['abbr'] + '_fullbench'
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
judge_models = deepcopy([models[1]])
judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)

View File

@ -7,36 +7,56 @@ import yaml
output_path = 'regression_result_daily'
chat_model_list = [
'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf',
'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind',
'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'baichuan2-7b-chat-hf', 'glm-4-9b-chat-hf', 'glm-4-9b-chat-turbomind',
'glm-4-9b-chat-vllm', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', 'gemma2-9b-it-hf',
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'gemma-2-9b-it-turbomind',
'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-lmdeploy',
'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm',
'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf',
'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm',
'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
'llama-3_1-8b-instruct-hf', 'llama-3_2-3b-instruct-hf',
'llama-3-8b-instruct-hf', 'llama-3_1-8b-instruct-turbomind',
'llama-3_2-3b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.2-hf', 'mistral-7b-instruct-v0.3-hf',
'mistral-nemo-instruct-2407-hf', 'mistral-nemo-instruct-2407-turbomind',
'mistral-7b-instruct-v0.1-vllm', 'mistral-7b-instruct-v0.2-vllm',
'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf',
'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf',
'lmdeploy-api-test'
'deepseek-v2-lite-chat-hf', 'internlm2_5-20b-chat-hf',
'internlm2_5-20b-chat-turbomind', 'mistral-small-instruct-2409-hf',
'mistral-small-instruct-2409-turbomind', 'qwen2.5-14b-instruct-hf',
'qwen2.5-14b-instruct-turbomind'
]
base_model_list = [
'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf',
'deepseek-7b-base-turbomind', 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf',
'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf',
'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind',
'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind',
'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf',
'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf',
'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
'glm-4-9b-hf', 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', 'gemma2-9b-hf',
'gemma-2b-hf', 'gemma-7b-hf', 'gemma-2b-vllm', 'gemma-7b-vllm',
'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
'internlm2-1.8b-turbomind', 'internlm2_5-7b-turbomind',
'internlm2-7b-turbomind', 'internlm2-base-7b-turbomind', 'llama-2-7b-hf',
'llama-3_1-8b-hf', 'llama-3-8b-hf', 'llama-3.1-8b-turbomind',
'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'mistral-7b-v0.3-hf',
'mistral-7b-v0.2-vllm', 'qwen2.5-7b-hf', 'qwen2.5-1.5b-turbomind',
'qwen2.5-7b-turbomind', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind',
'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf',
'deepseek-v2-lite-hf', 'internlm2-20b-hf', 'internlm2-base-20b-hf',
'internlm2-20b-turbomind', 'qwen2.5-14b-hf'
]
dataset_list = ['gsm8k', 'race-middle', 'race-high']
@pytest.fixture()
def baseline_scores_testrange(request):
config_path = os.path.join(
request.config.rootdir,
'.github/scripts/oc_score_baseline_testrange.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
return config
@pytest.fixture()
@ -48,6 +68,16 @@ def baseline_scores(request):
return config
@pytest.fixture()
def baseline_scores_fullbench(request):
config_path = os.path.join(
request.config.rootdir,
'.github/scripts/oc_score_baseline_fullbench.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
return config
@pytest.fixture()
def result_scores():
file = find_csv_files(output_path)
@ -57,100 +87,228 @@ def result_scores():
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.usefixtures('baseline_scores_testrange')
@pytest.mark.chat
class TestChat:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2)
for p1 in chat_model_list
for p2 in dataset_list])
def test_model_dataset_score(self, baseline_scores, result_scores, model,
dataset):
base_score = baseline_scores.get(model).get(dataset)
@pytest.mark.parametrize('model, dataset',
[(p1, p2) for p1 in chat_model_list
for p2 in ['gsm8k', 'race-high']])
def test_model_dataset_score(self, baseline_scores_testrange,
result_scores, model, dataset):
base_score = baseline_scores_testrange.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, base_score)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.usefixtures('baseline_scores_testrange')
@pytest.mark.base
class TestBase:
"""Test cases for base model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2)
for p1 in base_model_list
for p2 in dataset_list])
def test_model_dataset_score(self, baseline_scores, result_scores, model,
dataset):
if model == 'mistral-7b-v0.2-vllm' and dataset == 'race-high':
@pytest.mark.parametrize(
'model, dataset',
[(p1, p2) for p1 in base_model_list
for p2 in ['gsm8k', 'GPQA_diamond', 'race-high', 'winogrande']])
def test_model_dataset_score(self, baseline_scores_testrange,
result_scores, model, dataset):
if model in ['gemma-2b-vllm', 'gemma-7b-vllm'] and dataset != 'gsm8k':
return
base_score = baseline_scores.get(model).get(dataset)
base_score = baseline_scores_testrange.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, base_score)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_obj_fullbench
class TestChatObjFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-hf_fullbench',
'internlm2_5-7b-chat-turbomind_fullbench'
] for p2 in [
'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000',
'gsm8k', 'triviaqa_wiki_1shot', 'nq_open_1shot', 'hellaswag',
'TheoremQA', 'college', 'college_knowledge',
'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas',
'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn',
'ds1000_Pytorch', 'ds1000_Matplotlib'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_sub_fullbench
class TestChatSubFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-hf_fullbench',
'internlm2_5-7b-chat-turbomind_fullbench'
] for p2 in [
'Alignbench总分', 'Alignbench专业能力', 'AlpacaEvaltotal',
'AlpacaEvalhelpful_base', 'CompassArenacompassarena_language',
'CompassArenacompassarena_knowledge',
'CompassArenacompassarena_reason_v2',
'CompassArenacompassarena_math_v2',
'CompassArenacompassarena_creationv2_zh', 'Fofofofo_test_prompts',
'followbenchHSR_AVG', 'followbenchSSR_AVG', 'followbenchHSR_L1',
'followbenchHSR_L2', 'followbenchHSR_L3', 'followbenchHSR_L4',
'followbenchHSR_L5', 'followbenchSSR_L1', 'followbenchSSR_L2',
'followbenchSSR_L3', 'followbenchSSR_L4', 'followbenchSSR_L5',
'MTBench101average', 'Wildbenchscore'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.base_fullbench
class TestBaseFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench'
] for p2 in [
'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k',
'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag',
'TheoremQA', 'college', 'college_knowledge',
'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.api
class TestApibench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset',
[('lmdeploy-api-test', 'race-middle'),
('lmdeploy-api-test', 'race-high'),
('lmdeploy-api-test', 'gsm8k')])
def test_api(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
class TestCmdCase:
@pytest.mark.case1
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-hf', 'race-middle'),
('internlm2_5-7b-hf', 'race-high')])
def test_cmd_case1(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b-hf', 'race-high'),
('internlm2_5-7b-hf', 'demo_gsm8k'),
('internlm2-1.8b-hf', 'race-middle'),
('internlm2-1.8b-hf', 'race-high'),
('internlm2-1.8b-hf', 'demo_gsm8k')])
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model, result_score, base_score)
@pytest.mark.case2
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-chat-lmdeploy', 'race-middle'),
('internlm2_5-7b-chat-lmdeploy', 'race-high')])
def test_cmd_case2(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b-chat-lmdeploy', 'race-high'),
('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k'),
('internlm2-chat-1.8b-lmdeploy', 'race-middle'),
('internlm2-chat-1.8b-lmdeploy', 'race-high'),
('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k')])
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model + '_batch', result_score, base_score)
@pytest.mark.case3
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b_hf', 'race-middle'),
('internlm2_5-7b_hf', 'race-high')])
def test_cmd_case3(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b_hf', 'race-high'),
('internlm2_5-7b_hf', 'demo_gsm8k')])
def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model, result_score, base_score)
@pytest.mark.case4
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-chat_hf', 'race-middle'),
('internlm2_5-7b-chat_hf', 'race-high')])
def test_cmd_case4(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b-chat_hf', 'race-high'),
('internlm2_5-7b-chat_hf', 'demo_gsm8k')])
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model, result_score, base_score)
def assert_score(score, baseline):
THRESHOLD = 3
def assert_score(model_type, score, baseline):
if score is None or score == '-':
assert False, 'value is none'
if float(score) <= (baseline + 5) and float(score) >= (baseline - 5):
print(score + ' between ' + str(baseline - 5) + ' and ' +
str(baseline + 5))
assert True
if 'batch' not in model_type:
if float(score) <= (baseline + 0.01) and float(score) >= (baseline -
0.01):
print(' '.join([score, 'is equal', str(baseline)]))
assert True
else:
print(' '.join([score, 'is not equal', str(baseline)]))
assert False, ' '.join([score, 'is not equal', str(baseline)])
else:
assert False, score + ' not between ' + str(
baseline - 5) + ' and ' + str(baseline + 5)
if float(score) <= (baseline + THRESHOLD) and float(score) >= (
baseline - THRESHOLD):
print(' '.join([
score, 'is between',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
]))
assert True
else:
print(' '.join([
score, 'is not etween',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
]))
assert False, ' '.join([
score, 'is not etween',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
])
def find_csv_files(directory):
csv_files = []
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith('.csv'):
if file.endswith('.csv') and (file.startswith('summary') or
file.startswith('Subjective_all')):
csv_files.append(os.path.join(root, file))
csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
@ -163,14 +321,24 @@ def read_csv_file(file_path):
with open(file_path, 'r') as csvfile:
reader = csv.DictReader(csvfile)
filtered_data = []
for row in reader:
filtered_row = {
k: v
for k, v in row.items()
if k not in ['version', 'metric', 'mode']
}
filtered_data.append(filtered_row)
if 'Subjective_all' not in file_path:
for row in reader:
if row['metric'] is not None and 'bpb' not in row['metric']:
filtered_row = {
k: v
for k, v in row.items()
if k not in ['version', 'metric', 'mode']
}
filtered_data.append(filtered_row)
else:
for row in reader:
if row['Detailed Scores'] is not None:
filtered_row = row
filtered_row['dataset'] = filtered_row[
'Dataset'] + filtered_row['Detailed Scores']
del filtered_row['Dataset']
del filtered_row['Detailed Scores']
filtered_data.append(filtered_row)
result = {}
for data in filtered_data:

View File

@ -1,369 +1,34 @@
baichuan2-7b-chat-hf:
gsm8k: 30
race-middle: 74
race-high: 79
internlm2_5-7b-hf:
demo_gsm8k: 42.19
race-middle: 91.78
race-high: 90.02
glm-4-9b-chat-hf:
gsm8k: 75
race-middle: 88
race-high: 88
internlm2_5-7b_hf:
demo_gsm8k: 42.19
race-middle: 91.78
race-high: 90.02
glm-4-9b-chat-turbomind:
gsm8k: 69
race-middle: 82
race-high: 77
internlm2-1.8b-hf:
demo_gsm8k: 15.62
race-middle: 71.66
race-high: 66.38
glm-4-9b-chat-vllm:
gsm8k: 73
race-middle: 87
race-high: 87
internlm2_5-7b-chat-lmdeploy:
demo_gsm8k: 84.38
race-middle: 92.76
race-high: 90.54
deepseek-7b-chat-hf:
gsm8k: 60
race-middle: 74
race-high: 80
internlm2-chat-1.8b-lmdeploy:
demo_gsm8k: 31
race-middle: 81.34
race-high: 73.96
deepseek-moe-16b-chat-hf:
gsm8k: 62
race-middle: 62
race-high: 70
deepseek-v2-lite-chat-hf:
gsm8k: 59
race-middle: 82
race-high: 79
deepseek-7b-chat-vllm:
gsm8k: 63
race-middle: 74
race-high: 79
gemma-2b-it-hf:
gsm8k: 14
race-middle: 62
race-high: 52
gemma-7b-it-hf:
gsm8k: 39
race-middle: 74
race-high: 71
gemma-7b-it-vllm:
gsm8k: 38
race-middle: 75
race-high: 70
gemma2-2b-it-hf:
gsm8k: 62
race-middle: 75
race-high: 67
gemma2-9b-it-hf:
gsm8k: 80
race-middle: 89
race-high: 85
internlm2_5-7b-chat-hf:
gsm8k: 86
race-middle: 92
race-high: 93
internlm2_5-20b-chat-hf:
gsm8k: 91
race-middle: 95
race-high: 91
internlm2_5-7b-chat-turbomind:
gsm8k: 87
race-middle: 92
race-high: 93
internlm2_5-20b-chat-turbomind:
gsm8k: 91
race-middle: 95
race-high: 91
internlm2-chat-1.8b-turbomind:
gsm8k: 40
race-middle: 82
race-high: 83
internlm2-chat-1.8b-sft-turbomind:
gsm8k: 34
race-middle: 81
race-high: 83
internlm2-chat-7b-lmdeploy:
gsm8k: 69
race-middle: 90
race-high: 88
internlm2-chat-7b-sft-turbomind:
gsm8k: 71
race-middle: 91
race-high: 92
internlm2-chat-7b-vllm:
gsm8k: 63
race-middle: 90
race-high: 91
llama-3_1-8b-instruct-hf:
gsm8k: 82
race-middle: 82
race-high: 88
llama-3-8b-instruct-hf:
gsm8k: 77
race-middle: 85
race-high: 87
llama-3_1-8b-instruct-turbomind:
gsm8k: 79
race-middle: 82
race-high: 88
llama-3-8b-instruct-turbomind:
gsm8k: 77
race-middle: 85
race-high: 89
mistral-7b-instruct-v0.2-hf:
gsm8k: 48
race-middle: 82
race-high: 78
mistral-7b-instruct-v0.3-hf:
gsm8k: 53
race-middle: 80
race-high: 78
mistral-7b-instruct-v0.2-vllm:
gsm8k: 49
race-middle: 81
race-high: 77
minicpm-2b-dpo-fp32-hf:
gsm8k: 58
race-middle: 66
race-high: 74
minicpm-2b-sft-bf16-hf:
gsm8k: 58
race-middle: 75
race-high: 81
minicpm-2b-sft-fp32-hf:
gsm8k: 58
race-middle: 75
race-high: 81
phi-3-mini-4k-instruct-hf:
gsm8k: 67
race-middle: 81
race-high: 84
phi-3-small-8k-instruct-hf:
gsm8k: 88
race-middle: 89
race-high: 88
qwen1.5-0.5b-chat-hf:
gsm8k: 5
race-middle: 55
race-high: 50
qwen2-1.5b-instruct-hf:
gsm8k: 63
race-middle: 77
race-high: 86
qwen2-1.5b-instruct-turbomind:
gsm8k: 60
race-middle: 77
race-high: 86
qwen2-7b-instruct-turbomind:
gsm8k: 88
race-middle: 87
race-high: 89
qwen2-7b-instruct-hf:
gsm8k: 85
race-middle: 87
race-high: 91
qwen1.5-0.5b-chat-vllm:
gsm8k: 5
race-middle: 57
race-high: 51
yi-1.5-6b-chat-hf:
gsm8k: 72
race-middle: 88
race-high: 86
yi-1.5-9b-chat-hf:
gsm8k: 81
race-middle: 89
race-high: 91
internlm2_5-7b-chat_hf:
demo_gsm8k: 87.50
race-middle: 92.76
race-high: 90.48
lmdeploy-api-test:
gsm8k: 90
race-middle: 95
race-high: 96
deepseek-moe-16b-base-hf:
gsm8k: 25
race-middle: 35
race-high: 23
deepseek-v2-lite-hf:
gsm8k: 37
race-middle: 56
race-high: 62
deepseek-7b-base-turbomind:
gsm8k: 21
race-middle: 42
race-high: 42
deepseek-moe-16b-base-vllm:
gsm8k: 22
race-middle: 35
race-high: 20
gemma-2b-hf:
gsm8k: 19
race-middle: 33
race-high: 26
gemma-7b-hf:
gsm8k: 65
race-middle: 59
race-high: 66
gemma2-2b-hf:
gsm8k: 33
race-middle: 56
race-high: 58
gemma2-9b-hf:
gsm8k: 70
race-middle: 82
race-high: 84
internlm2_5-7b-hf:
gsm8k: 47
race-middle: 92
race-high: 91
internlm2-7b-hf:
gsm8k: 65
race-middle: 77
race-high: 72
internlm2-base-7b-hf:
gsm8k: 5
race-middle: 71
race-high: 74
internlm2_5-7b-turbomind:
gsm8k: 73
race-middle: 90
race-high: 91
internlm2-1.8b-turbomind:
gsm8k: 25
race-middle: 75
race-high: 72
internlm2-7b-turbomind:
gsm8k: 67
race-middle: 78
race-high: 76
internlm2-base-7b-turbomind:
gsm8k: 39
race-middle: 75
race-high: 81
llama-2-7b-hf:
gsm8k: 17
race-middle: 32
race-high: 38
llama-3-8b-hf:
gsm8k: 48
race-middle: 64
race-high: 70
llama-3.1-8b-turbomind:
gsm8k: 57
race-middle: 67
race-high: 75
llama-3-8b-turbomind:
gsm8k: 52
race-middle: 63
race-high: 70
mistral-7b-v0.2-hf:
gsm8k: 43
race-middle: 42
race-high: 60
mistral-7b-v0.3-hf:
gsm8k: 43
race-middle: 42
race-high: 60
mistral-7b-v0.2-vllm:
gsm8k: 45
race-middle: 42
race-high: 58
qwen1.5-moe-a2.7b-hf:
gsm8k: 64
race-middle: 78
race-high: 90
qwen2-1.5b-hf:
gsm8k: 58
race-middle: 65
race-high: 78
qwen2-0.5b-hf:
gsm8k: 35
race-middle: 52
race-high: 48
qwen2-7b-hf:
gsm8k: 82
race-middle: 88
race-high: 89
qwen2-1.5b-turbomind:
gsm8k: 57
race-middle: 64
race-high: 78
qwen2-7b-turbomind:
gsm8k: 83
race-middle: 88
race-high: 88
qwen1.5-0.5b-vllm:
gsm8k: 12
race-middle: 54
race-high: 59
yi-1.5-6b-hf:
gsm8k: 59
race-middle: 81
race-high: 89
yi-1.5-9b-hf:
gsm8k: 77
race-middle: 90
race-high: 90
gsm8k: 83.78
race-middle: 92.41
race-high: 90.37

View File

@ -0,0 +1,153 @@
internlm2_5-7b-chat-hf_fullbench:
race-high: 93.75
ARC-c: 87.5
BoolQ: 81.25
drop: 81.25
GPQA_diamond: 25
math: 75
wikibench-wiki-single_choice_cncircular: 50
sanitized_mbpp: 68.75
ds1000: 16.96
gsm8k: 56.25
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
hellaswag: 87.5
TheoremQA: 18.75
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 50
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 72.6
cmmlu-china-specific: 76.25
mmlu_pro_math: 25
ds1000_Pandas: 12.5
ds1000_Numpy: 0
ds1000_Tensorflow: 12.5
ds1000_Scipy: 18.75
ds1000_Sklearn: 18.75
ds1000_Pytorch: 12.5
ds1000_Matplotlib: 43.75
Alignbench总分: 0.65
Alignbench专业能力: 7.83
AlpacaEvaltotal: 0
AlpacaEvalhelpful_base: 0
CompassArenacompassarena_language: 60
CompassArenacompassarena_knowledge: 56
CompassArenacompassarena_reason_v2: 50
CompassArenacompassarena_math_v2: 53.5
CompassArenacompassarena_creationv2_zh: 48.75
Fofofofo_test_prompts: 1
followbenchHSR_AVG: 1
followbenchSSR_AVG: 1
followbenchHSR_L1: 1
followbenchHSR_L2: 1
followbenchHSR_L3: 1
followbenchHSR_L4: 1
followbenchHSR_L5: 1
followbenchSSR_L1: 1
followbenchSSR_L2: 1
followbenchSSR_L3: 1
followbenchSSR_L4: 1
followbenchSSR_L5: 1
MTBench101average: 8.1
Wildbenchscore: -3.3333333333333335
internlm2_5-7b-chat-turbomind_fullbench:
race-high: 93.75
ARC-c: 87.5
BoolQ: 68.75
drop: 75
GPQA_diamond: 25
math: 75
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 68.75
ds1000: 13.39
gsm8k: 68.75
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
hellaswag: 81.25
TheoremQA: 6.25
college: 0
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 56.25
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 74.04
cmmlu-china-specific: 76.25
mmlu_pro_math: 25
ds1000_Pandas: 0
ds1000_Numpy: 0
ds1000_Tensorflow: 12.5
ds1000_Scipy: 18.75
ds1000_Sklearn: 18.75
ds1000_Pytorch: 6.25
ds1000_Matplotlib: 37.5
Alignbench总分: 0.64
Alignbench专业能力: 7.6
AlpacaEvaltotal: 10
AlpacaEvalhelpful_base: 10
CompassArenacompassarena_language: 59
CompassArenacompassarena_knowledge: 57
CompassArenacompassarena_reason_v2: 49.5
CompassArenacompassarena_math_v2: 51
CompassArenacompassarena_creationv2_zh: 43.75
Fofofofo_test_prompts: 1
followbenchHSR_AVG: 1
followbenchSSR_AVG: 1
followbenchHSR_L1: 1
followbenchHSR_L2: 1
followbenchHSR_L3: 1
followbenchHSR_L4: 1
followbenchHSR_L5: 1
followbenchSSR_L1: 1
followbenchSSR_L2: 1
followbenchSSR_L3: 1
followbenchSSR_L4: 1
followbenchSSR_L5: 1
MTBench101average: 8.1
Wildbenchscore: -8.333333333333334
internlm2_5-7b-hf_fullbench:
race-high: 100
ARC-c: 68.75
BoolQ: 87.5
GPQA_diamond: 62.5
drop: 62.5
math: 12.5
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 56.25
gsm8k: 37.5
triviaqa_wiki_1shot: 43.75
nq_open_1shot: 43.75
winogrande: 75
hellaswag: 93.75
TheoremQA: 25
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 43.75
bbh-multistep_arithmetic_two: 56.25
mmlu-other: 76.92
cmmlu-china-specific: 84.17
mmlu_pro_math: 18.75
internlm2_5-7b-turbomind_fullbench:
race-high: 100
ARC-c: 68.75
BoolQ: 87.5
GPQA_diamond: 62.5
drop: 62.5
math: 18.75
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 56.25
gsm8k: 68.75
triviaqa_wiki_1shot: 43.75
nq_open_1shot: 43.75
winogrande: 87.5
hellaswag: 93.75
TheoremQA: 31.25
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 50
bbh-multistep_arithmetic_two: 56.25
mmlu-other: 76.92
cmmlu-china-specific: 84.17
mmlu_pro_math: 18.75

View File

@ -0,0 +1,459 @@
baichuan2-7b-chat-hf:
gsm8k: 18.75
race-high: 78.12
glm-4-9b-chat-hf:
gsm8k: 68.75
race-high: 90.62
glm-4-9b-chat-turbomind:
gsm8k: 75.00
race-high: 90.62
glm-4-9b-chat-vllm:
gsm8k: 65.62
race-high: 90.62
deepseek-7b-chat-hf:
gsm8k: 46.88
race-high: 81.25
deepseek-moe-16b-chat-hf:
gsm8k: 50
race-high: 68.75
deepseek-7b-chat-vllm:
gsm8k: 43.75
race-high: 75
gemma2-2b-it-hf:
gsm8k: 50
race-high: 71.88
gemma2-9b-it-hf:
gsm8k: 71.88
race-high: 84.38
gemma-2b-it-hf:
gsm8k: 3.12
race-high: 40.62
gemma-7b-it-hf:
gsm8k: 40.62
race-high: 68.75
gemma-2-9b-it-turbomind:
gsm8k: 68.75
race-high: 81.25
gemma-7b-it-vllm:
gsm8k: 28.12
race-high: 68.75
internlm2_5-7b-chat-hf:
gsm8k: 84.38
race-high: 90.62
internlm2_5-7b-chat-turbomind:
gsm8k: 84.38
race-high: 90.62
internlm2-chat-1.8b-turbomind:
gsm8k: 25
race-high: 84.38
internlm2-chat-1.8b-sft-turbomind:
gsm8k: 21.88
race-high: 84.38
internlm2-chat-7b-lmdeploy:
gsm8k: 53.12
race-high: 84.38
internlm2-chat-7b-sft-turbomind:
gsm8k: 50
race-high: 90.62
internlm2-chat-7b-vllm:
gsm8k: 43.75
race-high: 87.5
llama-3_1-8b-instruct-hf:
gsm8k: 84.38
race-high: 90.62
llama-3_2-3b-instruct-hf:
gsm8k: 65.62
race-high: 81.25
llama-3-8b-instruct-hf:
gsm8k: 68.75
race-high: 87.5
llama-3_1-8b-instruct-turbomind:
gsm8k: 78.12
race-high: 90.62
llama-3_2-3b-instruct-turbomind:
gsm8k: 65.62
race-high: 81.25
llama-3-8b-instruct-turbomind:
gsm8k: 68.75
race-high: 87.5
mistral-7b-instruct-v0.2-hf:
gsm8k: 40.62
race-high: 75
mistral-7b-instruct-v0.3-hf:
gsm8k: 40.62
race-high: 75
mistral-nemo-instruct-2407-hf:
gsm8k: 75
race-high: 81.25
mistral-nemo-instruct-2407-turbomind:
gsm8k: 75
race-high: 81.25
mistral-7b-instruct-v0.1-vllm:
gsm8k: 37.5
race-high: 71.88
mistral-7b-instruct-v0.2-vllm:
gsm8k: 43.75
race-high: 75
MiniCPM3-4B-hf:
gsm8k: 68.75
race-high: 84.38
minicpm-2b-dpo-fp32-hf:
gsm8k: 56.25
race-high: 56.25
minicpm-2b-sft-bf16-hf:
gsm8k: 46.88
race-high: 65.62
minicpm-2b-sft-fp32-hf:
gsm8k: 46.88
race-high: 65.62
phi-3-mini-4k-instruct-hf:
gsm8k: 56.25
race-high: 78.12
qwen1.5-0.5b-chat-hf:
gsm8k: 0
race-high: 53.12
qwen2-1.5b-instruct-hf:
gsm8k: 62.5
race-high: 84.38
qwen2-7b-instruct-hf:
gsm8k: 68.75
race-high: 90.62
qwen2-1.5b-instruct-turbomind:
gsm8k: 62.50
race-high: 84.38
qwen2-7b-instruct-turbomind:
gsm8k: 81.25
race-high: 87.5
qwen1.5-0.5b-chat-vllm:
gsm8k: 3.12
race-high: 53.12
yi-1.5-6b-chat-hf:
gsm8k: 65.62
race-high: 84.38
yi-1.5-9b-chat-hf:
gsm8k: 75
race-high: 93.75
deepseek-v2-lite-chat-hf:
gsm8k: 43.75
race-high: 71.88
internlm2_5-20b-chat-hf:
gsm8k: 84.38
race-high: 87.5
internlm2_5-20b-chat-turbomind:
gsm8k: 84.38
race-high: 87.5
mistral-small-instruct-2409-hf:
gsm8k: 81.25
race-high: 90.62
mistral-small-instruct-2409-turbomind:
gsm8k: 78.12
race-high: 90.62
qwen2.5-14b-instruct-hf:
gsm8k: 71.88
race-high: 93.75
qwen2.5-14b-instruct-turbomind:
gsm8k: 71.88
race-high: 93.75
glm-4-9b-hf:
gsm8k: 68.75
GPQA_diamond: 31.25
race-high: 93.75
winogrande: 84.38
deepseek-moe-16b-base-hf:
gsm8k: 21.88
GPQA_diamond: 0
race-high: 21.88
winogrande: 65.62
deepseek-7b-base-turbomind:
gsm8k: 21.88
GPQA_diamond: 0
race-high: 46.88
winogrande: 84.38
deepseek-moe-16b-base-vllm:
gsm8k: 21.88
GPQA_diamond: 0
race-high: 25
winogrande: 68.75
gemma2-2b-hf:
gsm8k: 31.25
GPQA_diamond: 3.12
race-high: 56.25
winogrande: 71.88
gemma2-9b-hf:
gsm8k: 68.75
GPQA_diamond: 0
race-high: 81.25
winogrande: 84.38
gemma-2b-hf:
gsm8k: 18.75
GPQA_diamond: 3.12
race-high: 25
winogrande: 53.12
gemma-7b-hf:
gsm8k: 56.25
GPQA_diamond: 6.25
race-high: 65.62
winogrande: 78.12
gemma-2b-vllm:
gsm8k: 18.75
GPQA_diamond: 6.25
race-high:
winogrande:
gemma-7b-vllm:
gsm8k: 59.38
GPQA_diamond: 6.25
race-high:
winogrande:
internlm2_5-7b-hf:
gsm8k: 37.5
GPQA_diamond: 25
race-high: 93.75
winogrande: 71.88
internlm2-7b-hf:
gsm8k: 53.12
GPQA_diamond: 18.75
race-high: 62.5
winogrande: 78.12
internlm2-base-7b-hf:
gsm8k: 3.12
GPQA_diamond: 21.88
race-high: 75
winogrande: 65.62
internlm2-1.8b-turbomind:
gsm8k: 12.5
GPQA_diamond: 12.5
race-high: 71.88
winogrande: 75
internlm2_5-7b-turbomind:
gsm8k: 68.75
GPQA_diamond: 31.25
race-high: 93.75
winogrande: 84.38
internlm2-7b-turbomind:
gsm8k: 56.25
GPQA_diamond: 21.88
race-high: 75
winogrande: 81.25
internlm2-base-7b-turbomind:
gsm8k: 40.62
GPQA_diamond: 28.12
race-high: 84.38
winogrande: 71.88
llama-2-7b-hf:
gsm8k: 21.88
GPQA_diamond: 21.88
race-high: 40.62
winogrande: 71.88
llama-3_1-8b-hf:
gsm8k: 78.12
GPQA_diamond: 25
race-high: 90.62
winogrande: 62.5
llama-3-8b-hf:
gsm8k: 46.88
GPQA_diamond: 6.25
race-high: 65.62
winogrande: 65.62
llama-3.1-8b-turbomind:
gsm8k: 56.25
GPQA_diamond: 6.25
race-high: 78.12
winogrande: 78.12
llama-3-8b-turbomind:
gsm8k: 50
GPQA_diamond: 9.38
race-high: 65.62
winogrande: 78.12
mistral-7b-v0.2-hf:
gsm8k: 31.25
GPQA_diamond: 6.25
race-high: 62.5
winogrande: 59.38
mistral-7b-v0.3-hf:
gsm8k: 31.25
GPQA_diamond: 6.25
race-high: 62.5
winogrande: 59.38
mistral-7b-v0.2-vllm:
gsm8k: 34.38
GPQA_diamond: 6.25
race-high: 62.5
winogrande: 65.62
qwen2.5-7b-hf:
gsm8k: 81.25
GPQA_diamond: 18.75
race-high: 87.5
winogrande: 71.88
qwen2.5-1.5b-turbomind:
gsm8k: 71.88
GPQA_diamond: 15.62
race-high: 78.12
winogrande: 71.88
qwen2.5-7b-turbomind:
gsm8k: 71.88
GPQA_diamond: 25
race-high: 87.5
winogrande: 71.88
qwen1.5-moe-a2.7b-hf:
gsm8k: 62.5
GPQA_diamond: 18.75
race-high: 84.38
winogrande: 75
qwen2-0.5b-hf:
gsm8k: 25
GPQA_diamond: 0
race-high: 40.62
winogrande: 62.5
qwen2-1.5b-hf:
gsm8k: 59.38
GPQA_diamond: 9.38
race-high: 81.25
winogrande: 62.5
qwen2-7b-hf:
gsm8k: 68.75
GPQA_diamond: 9.38
race-high: 87.5
winogrande: 68.75
qwen2-1.5b-turbomind:
gsm8k: 62.50
GPQA_diamond: 6.25
race-high: 81.25
winogrande: 75
qwen2-7b-turbomind:
gsm8k: 68.75
GPQA_diamond: 12.5
race-high: 87.5
winogrande: 71.88
qwen1.5-0.5b-vllm:
gsm8k: 9.38
GPQA_diamond: 0
race-high: 56.25
winogrande: 62.5
yi-1.5-6b-hf:
gsm8k: 62.5
GPQA_diamond: 3.12
race-high: 87.5
winogrande: 62.5
yi-1.5-9b-hf:
gsm8k: 75
GPQA_diamond: 40.62
race-high: 87.5
winogrande: 59.38
deepseek-v2-lite-hf:
gsm8k: 28.12
GPQA_diamond: 21.88
race-high: 59.38
winogrande: 75
internlm2-20b-hf:
gsm8k: 56.25
GPQA_diamond: 15.62
race-high: 68.75
winogrande: 75
internlm2-base-20b-hf:
gsm8k: 12.5
GPQA_diamond: 9.38
race-high: 84.38
winogrande: 65.62
internlm2-20b-turbomind:
gsm8k: 68.75
GPQA_diamond: 15.62
race-high: 68.75
winogrande: 81.25
qwen2.5-14b-hf:
gsm8k: 75
GPQA_diamond: 37.5
race-high: 93.75
winogrande: 84.38

View File

@ -13,11 +13,31 @@ on:
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
build_lmdeploy:
required: false
description: 'whether to build lmdeploy'
type: boolean
default: false
repo_org_lmdeploy:
required: false
description: 'Tested repository organization name. Default is internlm/lmdeploy'
type: string
default: 'InternLM/lmdeploy'
repo_ref_lmdeploy:
required: false
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
regression_func:
required: true
description: 'regression functions'
type: string
default: "['chat','base','cmd']"
default: "['chat_models','base_models', 'chat_obj_fullbench', 'chat_sub_fullbench', 'base_fullbench','cmd', 'api']"
cuda_env:
required: true
description: "regression conda env, eg. ['dsw_cu11','dsw_cu12']"
type: string
default: "['dsw_cu12']"
schedule:
- cron: '56 16 * * *'
@@ -31,7 +51,7 @@ env:
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache
HF_DATASETS_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1
@@ -39,6 +59,8 @@ env:
LMDEPLOY_USE_MODELSCOPE: false
HF_HUB_OFFLINE: 1
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report
OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
jobs:
build-pypi:
@@ -64,16 +86,51 @@ jobs:
retention-days: 1
name: my-artifact-${{ github.run_id }}
daily_run_test:
build-pypi-lmdeploy:
if: ${{!cancelled() && (github.event_name != 'schedule' && inputs.build_lmdeploy)}}
strategy:
matrix:
pyver: [py310]
runs-on: ubuntu-latest
env:
PYTHON_VERSION: ${{ matrix.pyver }}
PLAT_NAME: manylinux2014_x86_64
DOCKER_TAG: cuda12.1
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
repository: ${{ github.event.inputs.repo_org_lmdeploy || 'InternLM/lmdeploy' }}
ref: ${{github.event.inputs.repo_ref_lmdeploy || 'main'}}
- name: Build
run: |
echo ${PYTHON_VERSION}
echo ${PLAT_NAME}
echo ${DOCKER_TAG}
echo ${OUTPUT_FOLDER}
echo ${GITHUB_RUN_ID}
# remove -it
sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
- name: Upload Artifacts
uses: actions/upload-artifact@v4
with:
if-no-files-found: error
path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
retention-days: 1
name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}
prepare_env:
if: ${{!cancelled()}}
needs: build-pypi
needs: ['build-pypi', 'build-pypi-lmdeploy']
strategy:
fail-fast: false
matrix:
cuda_env: [dsw_cu11, dsw_cu12]
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
runs-on: ${{ matrix.cuda_env }}
environment: 'prod'
timeout-minutes: 600 #10hours
timeout-minutes: 240 #4hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@@ -84,89 +141,169 @@ jobs:
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}
- name: Remove Conda Env
if: always()
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
- name: Prepare - create conda env and install torch - cu11
if: ${{matrix.cuda_env == 'dsw_cu11'}}
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
uses: nick-fields/retry@v3
id: retry1
with:
max_attempts: 3
timeout_minutes: 40
command: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - create conda env and install torch - cu12
if: ${{matrix.cuda_env == 'dsw_cu12'}}
uses: nick-fields/retry@v3
id: retry2
with:
max_attempts: 3
timeout_minutes: 40
command: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - reinstall lmdeploy - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}-py310
- name: Prepare - reinstall lmdeploy - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
pip install lmdeploy-*.whl --no-deps
daily_run_test:
if: ${{!cancelled()}}
needs: prepare_env
strategy:
fail-fast: false
matrix:
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}}
runs-on: ${{ matrix.cuda_env }}
environment: 'prod'
timeout-minutes: 240 #4hours
steps:
- name: Clone repository
uses: actions/checkout@v2
with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Prepare - prepare data and hf model
run: |
ln -s ${{env.DATEASET_CACHE_PATH}} data
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
- name: Run command testcase
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'cmd')
if: matrix.regression_func == 'cmd'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu
opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run chat model test
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'chat')
if: matrix.regression_func == 'chat_models'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py
opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass .github/scripts/eval_regression_chat.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'base')
if: matrix.regression_func == 'base_models'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass .github/scripts/eval_regression_base.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Remove Conda Env
if: always()
- name: Run chat model test - objective fullbench
if: matrix.regression_func == 'chat_obj_fullbench'
run: |
rm -rf regression_result_daily
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat_objective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat_obj_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run chat model test - subjective fullbench
if: matrix.regression_func == 'chat_sub_fullbench'
env:
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache_subset
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat_subjective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat_sub_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test - fullbench
if: matrix.regression_func == 'base_fullbench'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_base_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m base_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api
if: matrix.regression_func == 'api'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 120s
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api kill
if: always() && matrix.regression_func == 'api'
run: |
kill -15 "$restful_pid"
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}