[CI] update torch version and add more datasets into daily testcase (#1701)

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

---------

Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
zhulinJulia24 authored 2024-11-21 10:37:33 +08:00; committed by GitHub
parent 05044dfaf2
commit ed81f9df30
7 changed files with 160 additions and 77 deletions

View File

@@ -7,6 +7,8 @@ with read_base():
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.dingo.dingo_gen import \
datasets as dingo_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_gen_a2697c import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
@@ -120,6 +122,8 @@ summarizer = dict(
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
['dingo_en_192', 'score'],
['dingo_zh_170', 'score'],
'###### MathBench-A: Application Part ######',
'college',
'high',

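For context on the hunk above: under read_base(), each dataset config exports a *_datasets list (the dingo config exports `datasets`, hence the rename to `dingo_datasets`). A minimal sketch of the convention these configs rely on, assuming the usual OpenCompass pattern; the aggregation line itself sits outside the shown hunk:

# Minimal sketch, not part of this commit: the usual OpenCompass pattern for
# merging every imported `*_datasets` variable into one flat `datasets` list.
from mmengine.config import read_base

with read_base():
    # Imports under read_base() are resolved by mmengine's config machinery.
    from opencompass.configs.datasets.dingo.dingo_gen import \
        datasets as dingo_datasets  # noqa: F401, E501

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])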
View File

@@ -59,6 +59,8 @@ with read_base():
models as hf_llama3_2_3b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as hf_llama3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \
models as lmdeploy_llama2_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \

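The model-list file above gains an lmdeploy-backed llama2-7b-chat entry. A minimal sketch of the matching collection step, assuming the standard OpenCompass convention; the merge line is outside the shown hunk:

# Sketch only; assumes the standard convention, not code from this diff.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \
        models as lmdeploy_llama2_7b_chat_model  # noqa: F401, E501

# Merge every `*_model` list into the `models` list the runner iterates over.
models = sum((v for k, v in locals().items() if k.endswith('_model')), [])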
View File

@@ -3,12 +3,16 @@ from mmengine.config import read_base
with read_base():
# read hf models - chat models
# Dataset
from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import \
aime2024_datasets # noqa: F401, E501
from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
ARC_c_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
cmo_fib_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \
@@ -28,6 +32,8 @@ with read_base():
humanevalx_datasets # noqa: F401, E501
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
ifeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
LCB_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
@@ -38,6 +44,10 @@ with read_base():
mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \
mmmlu_lite_datasets # noqa: F401, E501
from opencompass.configs.datasets.musr.musr_gen_3c6e15 import \
musr_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_cot_gen_d95929 import \
@@ -77,10 +87,14 @@ with read_base():
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.musr_average import \
summarizer as musr_summarizer # noqa: F401, E501
from opencompass.configs.summarizers.groups.scicode import \
scicode_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.teval import \
teval_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.mmmlu_lite import \
mmmlu_summary_groups # noqa: F401, E501
# For HumanEval-X Evaluation
# Apply the evaluator ip_address and port
@@ -122,6 +136,10 @@ mmlu_datasets = [
]
mmlu_pro_datasets = [mmlu_pro_datasets[0]]
mmmlu_lite_datasets = [
x for x in mmmlu_lite_datasets if 'mmlu_lite_AR-XY' in x['abbr']
]
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
GaokaoBench_datasets = [
x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
@@ -137,52 +155,68 @@ datasets += teval_en_datasets
datasets += teval_zh_datasets
# datasets += SciCode_datasets
musr_summary_groups = musr_summarizer['summary_groups']
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
summary_groups.append(
{
'name': 'Mathbench',
'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
}, )
# Summarizer
summarizer = dict(
dataset_abbrs=[
'Language',
['race-high', 'accuracy'],
['ARC-c', 'accuracy'],
['BoolQ', 'accuracy'],
['mmlu_pro', 'naive_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['mmmlu_lite', 'naive_average'],
'',
'Instruction Following',
['IFEval', 'Prompt-level-strict-accuracy'],
'',
'General Reasoning',
['drop', 'accuracy'],
['bbh', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
['musr_average', 'naive_average'],
'',
'Math Calculation',
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
['math', 'accuracy'],
['cmo_fib', 'accuracy'],
['aime2024', 'accuracy'],
['Mathbench', 'naive_average'],
'',
'Knowledge',
['wikibench-wiki-single_choice_cncircular', 'perf_4'],
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['cmmlu', 'naive_average'],
['mmlu', 'naive_average'],
['mmlu_pro', 'naive_average'],
'',
'Code',
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['humanevalx', 'naive_average'],
['ds1000', 'naive_average'],
['lcb_code_generation', 'pass@1'],
['lcb_code_execution', 'pass@1'],
['lcb_test_output', 'pass@1'],
'',
'Agent',
['teval', 'naive_average'],
['SciCode', 'accuracy'],
['SciCode', 'sub_accuracy'],
['humanevalx', 'naive_average'],
['ds1000', 'naive_average'],
['IFEval', 'Prompt-level-strict-accuracy'],
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
'###### Overall: Average between MathBench-A and MathBench-T ######',
'Overall',
'',
'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two',
''
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
@@ -212,15 +246,6 @@ summarizer = dict(
'mmlu_pro_psychology',
'mmlu_pro_other',
'',
'GaokaoBench_2010-2022_Math_II_MCQs',
'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
'',
'humanevalx-python',
'humanevalx-cpp',
'humanevalx-go',
'humanevalx-java',
'humanevalx-js',
'',
'ds1000_Pandas',
'ds1000_Numpy',
'ds1000_Tensorflow',
@@ -228,9 +253,38 @@ summarizer = dict(
'ds1000_Sklearn',
'ds1000_Pytorch',
'ds1000_Matplotlib',
'',
'mmmlu_lite',
'openai_mmmlu_lite_AR-XY',
'openai_mmmlu_lite_BN-BD',
'openai_mmmlu_lite_DE-DE',
'openai_mmmlu_lite_ES-LA',
'openai_mmmlu_lite_FR-FR',
'openai_mmmlu_lite_HI-IN',
'openai_mmmlu_lite_ID-ID',
'openai_mmmlu_lite_IT-IT',
'openai_mmmlu_lite_JA-JP',
'openai_mmmlu_lite_KO-KR',
'openai_mmmlu_lite_PT-BR',
'openai_mmmlu_lite_SW-KE',
'openai_mmmlu_lite_YO-NG',
'openai_mmmlu_lite_ZH-CN',
'',
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
summary_groups=summary_groups,
)
for d in datasets:

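The config above pulls the summary_groups construction out of the summarizer dict so a custom 'Mathbench' group can be appended before it is passed in. A toy illustration of what the locals() scan produces; the group names and subsets below are invented for illustration only:

# Toy illustration, not from this commit: merging `*_summary_groups` lists and
# appending an extra group, mirroring the pattern in the hunks above.
mmlu_summary_groups = [{'name': 'mmlu', 'subsets': ['mmlu-stem', 'mmlu-other']}]
musr_summary_groups = [{'name': 'musr_average', 'subsets': ['musr_murder_mysteries']}]

summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
summary_groups.append({
    'name': 'Mathbench',
    'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
})
# summary_groups now contains the mmlu, musr_average and Mathbench definitions.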
View File

@@ -131,14 +131,16 @@ class TestChatObjFullbench:
'internlm2_5-7b-chat-hf_fullbench',
'internlm2_5-7b-chat-turbomind_fullbench'
] for p2 in [
'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
'race-high', 'ARC-c', 'BoolQ', 'triviaqa_wiki_1shot', 'nq_open_1shot',
'IFEval', 'drop', 'GPQA_diamond', 'hellaswag', 'TheoremQA',
'musr_average', 'gsm8k', 'math', 'cmo_fib', 'aime2024',
'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000',
'gsm8k', 'triviaqa_wiki_1shot', 'nq_open_1shot', 'hellaswag',
'TheoremQA', 'college', 'college_knowledge',
'lcb_code_generation', 'lcb_code_execution', 'lcb_test_output',
'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas',
'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn',
'ds1000_Pytorch', 'ds1000_Matplotlib'
'ds1000_Pytorch', 'ds1000_Matplotlib', 'openai_mmmlu_lite_AR-XY',
'college', 'college_knowledge'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
@@ -188,9 +190,10 @@ class TestBaseFullbench:
'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k',
'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag',
'TheoremQA', 'college', 'college_knowledge',
'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math'
'TheoremQA', 'dingo_en_192', 'dingo_zh_170', 'college',
'college_knowledge', 'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two', 'mmlu-other', 'cmmlu-china-specific',
'mmlu_pro_math'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):

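The tests above compare per-model, per-dataset scores against the baseline YAML that follows. A minimal sketch of the shape of such a check, with illustrative fixture data and an assumed tolerance; the real assertion helper and thresholds are not shown in this diff:

# Hedged sketch; the parametrize shape follows the diff, the comparison is an assumption.
import pytest

BASELINE = {('internlm2_5-7b-chat-hf_fullbench', 'race-high'): 93.75}  # illustrative
RESULTS = {('internlm2_5-7b-chat-hf_fullbench', 'race-high'): 93.75}   # illustrative

@pytest.mark.parametrize('model, dataset', [
    (p1, p2)
    for p1 in ['internlm2_5-7b-chat-hf_fullbench',
               'internlm2_5-7b-chat-turbomind_fullbench']
    for p2 in ['race-high', 'ARC-c', 'BoolQ']
])
def test_model_dataset_score(model, dataset):
    baseline, result = BASELINE.get((model, dataset)), RESULTS.get((model, dataset))
    if baseline is None or result is None:
        pytest.skip('combination not covered in this illustration')
    # Tolerance is an assumption; the real suite may require exact matches.
    assert result == pytest.approx(baseline, abs=0.01)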
View File

@@ -2,19 +2,24 @@ internlm2_5-7b-chat-hf_fullbench:
race-high: 93.75
ARC-c: 93.75
BoolQ: 81.25
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
IFEval: 50
drop: 81.25
GPQA_diamond: 25
hellaswag: 87.5
TheoremQA: 18.75
musr_average: 39.58
gsm8k: 56.25
math: 75
cmo_fib: 6.25
aime2024: 6.25
wikibench-wiki-single_choice_cncircular: 50
sanitized_mbpp: 68.75
ds1000: 16.96
gsm8k: 56.25
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
hellaswag: 87.5
TheoremQA: 18.75
college: 12.5
college_knowledge: 87.5
lcb_code_generation: 12.5
lcb_code_execution: 43.75
lcb_test_output: 18.75
bbh-logical_deduction_seven_objects: 50
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 72.6
@@ -27,6 +32,9 @@ internlm2_5-7b-chat-hf_fullbench:
ds1000_Sklearn: 18.75
ds1000_Pytorch: 12.5
ds1000_Matplotlib: 43.75
openai_mmmlu_lite_AR-XY: 37.5
college: 12.5
college_knowledge: 87.5
Alignbench总分: 0.65
Alignbench专业能力: 7.83
AlpacaEvaltotal: 0
@@ -56,19 +64,24 @@ internlm2_5-7b-chat-turbomind_fullbench:
race-high: 93.75
ARC-c: 87.5
BoolQ: 68.75
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
IFEval: 50
drop: 75
GPQA_diamond: 25
hellaswag: 81.25
TheoremQA: 6.25
musr_average: 39.58
gsm8k: 68.75
math: 75
GPQA_diamond: 25
cmo_fib: 6.25
aime2024: 6.25
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 68.75
ds1000: 13.39
gsm8k: 68.75
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
hellaswag: 81.25
TheoremQA: 6.25
college: 0
college_knowledge: 87.5
lcb_code_generation: 12.5
lcb_code_execution: 43.75
lcb_test_output: 12.5
bbh-logical_deduction_seven_objects: 56.25
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 74.04
@@ -81,6 +94,9 @@ internlm2_5-7b-chat-turbomind_fullbench:
ds1000_Sklearn: 18.75
ds1000_Pytorch: 6.25
ds1000_Matplotlib: 37.5
openai_mmmlu_lite_AR-XY: 37.5
college: 0
college_knowledge: 87.5
Alignbench总分: 0.64
Alignbench专业能力: 7.6
AlpacaEvaltotal: 10
@@ -121,6 +137,8 @@ internlm2_5-7b-hf_fullbench:
winogrande: 75
hellaswag: 93.75
TheoremQA: 25
dingo_en_192: 37.5
dingo_zh_170: 100
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 43.75
@@ -144,6 +162,8 @@ internlm2_5-7b-turbomind_fullbench:
winogrande: 87.5
hellaswag: 93.75
TheoremQA: 31.25
dingo_en_192: 43.75
dingo_zh_170: 100
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 50

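These per-model baselines back the baseline_scores_fullbench fixture used by the tests above. A minimal sketch of loading and querying such a file, assuming a standard PyYAML safe load and an assumed file path; the real fixture implementation is not part of this diff:

# Sketch with an assumed file path; not code from this commit.
import yaml

with open('.github/scripts/oc_score_baseline_fullbench.yaml') as f:
    baselines = yaml.safe_load(f)

# Nested mapping: model abbreviation -> dataset/metric name -> expected score.
print(baselines['internlm2_5-7b-chat-hf_fullbench']['race-high'])  # 93.75 above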
View File

@@ -43,11 +43,11 @@ gemma-7b-it-hf:
race-high: 68.75
gemma-2-9b-it-turbomind:
gsm8k: 68.75
gsm8k: 65.62
race-high: 84.38
gemma-7b-it-vllm:
gsm8k: 28.12
gsm8k: 34.38
race-high: 68.75
internlm2_5-7b-chat-hf:
@@ -95,7 +95,7 @@ llama-3_1-8b-instruct-turbomind:
race-high: 90.62
llama-3_2-3b-instruct-turbomind:
gsm8k: 65.62
gsm8k: 62.50
race-high: 81.25
llama-3-8b-instruct-turbomind:
@@ -112,15 +112,15 @@ mistral-7b-instruct-v0.3-hf:
mistral-nemo-instruct-2407-hf:
gsm8k: 75
race-high: 84.38
race-high: 81.25
mistral-nemo-instruct-2407-turbomind:
gsm8k: 75
race-high: 84.38
gsm8k: 68.75
race-high: 87.50
mistral-7b-instruct-v0.1-vllm:
gsm8k: 37.5
race-high: 71.88
gsm8k: 34.38
race-high: 68.75
mistral-7b-instruct-v0.2-vllm:
gsm8k: 43.75
@@ -255,13 +255,13 @@ gemma-7b-hf:
winogrande: 78.12
gemma-2b-vllm:
gsm8k: 18.75
gsm8k: 15.62
GPQA_diamond: 6.25
race-high:
winogrande:
gemma-7b-vllm:
gsm8k: 59.38
gsm8k: 53.12
GPQA_diamond: 6.25
race-high:
winogrande:

View File

@@ -163,9 +163,9 @@ jobs:
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu11torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - create conda env and install torch - cu12
@@ -183,9 +183,9 @@ jobs:
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - reinstall lmdeploy - cu12
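With the bump to torch 2.5.1 (cu118 wheels in the first env, the default index in the cu12 env) and the rebuilt flash-attn and xformers wheels, a quick import check catches version or ABI mismatches early. A suggested sanity check, not part of this commit:

# Suggested post-install sanity check; not part of the workflow in this diff.
import torch

print('torch', torch.__version__)          # expect 2.5.1+cu118 or 2.5.1
print('cuda available:', torch.cuda.is_available())
print('cuda build:', torch.version.cuda)   # 11.8 or 12.x depending on the env

import flash_attn  # an import failure here usually indicates an ABI mismatch
import xformers

print('flash_attn', flash_attn.__version__)
print('xformers', xformers.__version__)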