From ed81f9df302b6c9d61d5167e7c98bbedd238a09d Mon Sep 17 00:00:00 2001
From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
Date: Thu, 21 Nov 2024 10:37:33 +0800
Subject: [PATCH] [CI] update torch version and add more datasets to the
 daily test cases (#1701)

* update

---------

Co-authored-by: zhulin1
---
 .../scripts/eval_regression_base_fullbench.py |   4 +
 .github/scripts/eval_regression_chat.py       |   2 +
 ...val_regression_chat_objective_fullbench.py | 132 ++++++++++++------
 .github/scripts/oc_score_assert.py            |  17 ++-
 .../scripts/oc_score_baseline_fullbench.yaml  |  50 +++++--
 .../scripts/oc_score_baseline_testrange.yaml  |  20 +--
 .github/workflows/daily-run-test.yml          |  12 +-
 7 files changed, 160 insertions(+), 77 deletions(-)

diff --git a/.github/scripts/eval_regression_base_fullbench.py b/.github/scripts/eval_regression_base_fullbench.py
index d5ad48c4..11c2f514 100644
--- a/.github/scripts/eval_regression_base_fullbench.py
+++ b/.github/scripts/eval_regression_base_fullbench.py
@@ -7,6 +7,8 @@ with read_base():
         bbh_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
         cmmlu_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.dingo.dingo_gen import \
+        datasets as dingo_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.drop.drop_gen_a2697c import \
         drop_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
@@ -120,6 +122,8 @@ summarizer = dict(
         ['winogrande', 'accuracy'],
         ['hellaswag', 'accuracy'],
         ['TheoremQA', 'score'],
+        ['dingo_en_192', 'score'],
+        ['dingo_zh_170', 'score'],
         '###### MathBench-A: Application Part ######',
         'college',
         'high',
diff --git a/.github/scripts/eval_regression_chat.py b/.github/scripts/eval_regression_chat.py
index 68c225c5..7762e4f7 100644
--- a/.github/scripts/eval_regression_chat.py
+++ b/.github/scripts/eval_regression_chat.py
@@ -59,6 +59,8 @@ with read_base():
         models as hf_llama3_2_3b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
         models as hf_llama3_8b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \
+        models as lmdeploy_llama2_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
         models as lmdeploy_llama3_1_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
diff --git a/.github/scripts/eval_regression_chat_objective_fullbench.py b/.github/scripts/eval_regression_chat_objective_fullbench.py
index ff8dfba4..c66fba33 100644
--- a/.github/scripts/eval_regression_chat_objective_fullbench.py
+++ b/.github/scripts/eval_regression_chat_objective_fullbench.py
@@ -3,12 +3,16 @@ from mmengine.config import read_base
 with read_base():
     # read hf models - chat models
     # Dataset
+    from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import \
+        aime2024_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
         ARC_c_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
         bbh_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
         cmmlu_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
+        cmo_fib_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
         drop_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \
@@ -28,6 +32,8 @@ with read_base():
         humanevalx_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
         ifeval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
+        LCB_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
         math_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
@@ -38,6 +44,10 @@ with read_base():
         mmlu_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
         mmlu_pro_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \
+        mmmlu_lite_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.musr.musr_gen_3c6e15 import \
+        musr_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
         nq_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.race.race_cot_gen_d95929 import \
@@ -77,10 +87,14 @@ with read_base():
         mmlu_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.mmlu_pro import \
         mmlu_pro_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.musr_average import \
+        summarizer as musr_summarizer  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.scicode import \
         scicode_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.teval import \
         teval_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.mmmlu_lite import \
+        mmmlu_summary_groups  # noqa: F401, E501
 
 # For HumanEval-X Evaluation
 # Apply the evaluator ip_address and port
@@ -122,6 +136,10 @@ mmlu_datasets = [
 ]
 
 mmlu_pro_datasets = [mmlu_pro_datasets[0]]
+
+mmmlu_lite_datasets = [
+    x for x in mmmlu_lite_datasets if 'mmlu_lite_AR-XY' in x['abbr']
+]
 mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
 GaokaoBench_datasets = [
     x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
@@ -137,52 +155,68 @@ datasets += teval_en_datasets
 datasets += teval_zh_datasets
 # datasets += SciCode_datasets
 
+musr_summary_groups = musr_summarizer['summary_groups']
+summary_groups = sum(
+    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
+summary_groups.append(
+    {
+        'name': 'Mathbench',
+        'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
+    }, )
+
+# Summarizer
 summarizer = dict(
     dataset_abbrs=[
+        'Language',
         ['race-high', 'accuracy'],
         ['ARC-c', 'accuracy'],
         ['BoolQ', 'accuracy'],
-        ['mmlu_pro', 'naive_average'],
+        ['triviaqa_wiki_1shot', 'score'],
+        ['nq_open_1shot', 'score'],
+        ['mmmlu_lite', 'naive_average'],
+        '',
+        'Instruction Following',
+        ['IFEval', 'Prompt-level-strict-accuracy'],
+        '',
+        'General Reasoning',
         ['drop', 'accuracy'],
         ['bbh', 'naive_average'],
         ['GPQA_diamond', 'accuracy'],
+        ['hellaswag', 'accuracy'],
+        ['TheoremQA', 'score'],
+        ['musr_average', 'naive_average'],
+        '',
+        'Math Calculation',
+        ['gsm8k', 'accuracy'],
+        ['GaokaoBench', 'weighted_average'],
         ['math', 'accuracy'],
+        ['cmo_fib', 'accuracy'],
+        ['aime2024', 'accuracy'],
+        ['Mathbench', 'naive_average'],
+        '',
+        'Knowledge',
         ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
-        ['openai_humaneval', 'humaneval_pass@1'],
-        ['sanitized_mbpp', 'score'],
         ['cmmlu', 'naive_average'],
         ['mmlu', 'naive_average'],
+        ['mmlu_pro', 'naive_average'],
+        '',
+        'Code',
+        ['openai_humaneval', 'humaneval_pass@1'],
+        ['sanitized_mbpp', 'score'],
+        ['humanevalx', 'naive_average'],
+        ['ds1000', 'naive_average'],
+        ['lcb_code_generation', 'pass@1'],
+        ['lcb_code_execution', 'pass@1'],
+        ['lcb_test_output', 'pass@1'],
+        '',
+        'Agent',
         ['teval', 'naive_average'],
         ['SciCode', 'accuracy'],
         ['SciCode', 'sub_accuracy'],
-        ['humanevalx', 'naive_average'],
-        ['ds1000', 'naive_average'],
-        ['IFEval', 'Prompt-level-strict-accuracy'],
-        ['gsm8k', 'accuracy'],
-        ['GaokaoBench', 'weighted_average'],
-        ['triviaqa_wiki_1shot', 'score'],
-        ['nq_open_1shot', 'score'],
-        ['hellaswag', 'accuracy'],
-        ['TheoremQA', 'score'],
-        '###### MathBench-A: Application Part ######',
-        'college',
-        'high',
-        'middle',
-        'primary',
-        'arithmetic',
-        'mathbench-a (average)',
-        '###### MathBench-T: Theory Part ######',
-        'college_knowledge',
-        'high_knowledge',
-        'middle_knowledge',
-        'primary_knowledge',
-        'mathbench-t (average)',
-        '###### Overall: Average between MathBench-A and MathBench-T ######',
-        'Overall',
         '',
         'bbh-logical_deduction_seven_objects',
         'bbh-multistep_arithmetic_two',
-        ''
+        '',
         'mmlu',
         'mmlu-stem',
         'mmlu-social-science',
@@ -212,15 +246,6 @@ summarizer = dict(
         'mmlu_pro_psychology',
         'mmlu_pro_other',
         '',
-        'GaokaoBench_2010-2022_Math_II_MCQs',
-        'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
-        '',
-        'humanevalx-python',
-        'humanevalx-cpp',
-        'humanevalx-go',
-        'humanevalx-java',
-        'humanevalx-js',
-        '',
         'ds1000_Pandas',
         'ds1000_Numpy',
         'ds1000_Tensorflow',
@@ -228,9 +253,38 @@ summarizer = dict(
         'ds1000_Sklearn',
         'ds1000_Pytorch',
         'ds1000_Matplotlib',
+        '',
+        'mmmlu_lite',
+        'openai_mmmlu_lite_AR-XY',
+        'openai_mmmlu_lite_BN-BD',
+        'openai_mmmlu_lite_DE-DE',
+        'openai_mmmlu_lite_ES-LA',
+        'openai_mmmlu_lite_FR-FR',
+        'openai_mmmlu_lite_HI-IN',
+        'openai_mmmlu_lite_ID-ID',
+        'openai_mmmlu_lite_IT-IT',
+        'openai_mmmlu_lite_JA-JP',
+        'openai_mmmlu_lite_KO-KR',
+        'openai_mmmlu_lite_PT-BR',
+        'openai_mmmlu_lite_SW-KE',
+        'openai_mmmlu_lite_YO-NG',
+        'openai_mmmlu_lite_ZH-CN',
+        '',
+        '###### MathBench-A: Application Part ######',
+        'college',
+        'high',
+        'middle',
+        'primary',
+        'arithmetic',
+        'mathbench-a (average)',
+        '###### MathBench-T: Theory Part ######',
+        'college_knowledge',
+        'high_knowledge',
+        'middle_knowledge',
+        'primary_knowledge',
+        'mathbench-t (average)',
     ],
-    summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
+    summary_groups=summary_groups,
 )
 
 for d in datasets:
diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py
index d8e33adb..179dec27 100644
--- a/.github/scripts/oc_score_assert.py
+++ b/.github/scripts/oc_score_assert.py
@@ -131,14 +131,16 @@ class TestChatObjFullbench:
         'internlm2_5-7b-chat-hf_fullbench',
         'internlm2_5-7b-chat-turbomind_fullbench'
     ] for p2 in [
-        'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
+        'race-high', 'ARC-c', 'BoolQ', 'triviaqa_wiki_1shot', 'nq_open_1shot',
+        'IFEval', 'drop', 'GPQA_diamond', 'hellaswag', 'TheoremQA',
+        'musr_average', 'gsm8k', 'math', 'cmo_fib', 'aime2024',
         'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000',
-        'gsm8k', 'triviaqa_wiki_1shot', 'nq_open_1shot', 'hellaswag',
-        'TheoremQA', 'college', 'college_knowledge',
+        'lcb_code_generation', 'lcb_code_execution', 'lcb_test_output',
         'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
         'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas',
         'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn',
-        'ds1000_Pytorch', 'ds1000_Matplotlib'
+        'ds1000_Pytorch', 'ds1000_Matplotlib', 'openai_mmmlu_lite_AR-XY',
+        'college', 'college_knowledge'
     ]])
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
@@ -188,9 +190,10 @@ class TestBaseFullbench:
         'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
         'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k',
         'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag',
-        'TheoremQA', 'college', 'college_knowledge',
-        'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
-        'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math'
+        'TheoremQA', 'dingo_en_192', 'dingo_zh_170', 'college',
+        'college_knowledge', 'bbh-logical_deduction_seven_objects',
+        'bbh-multistep_arithmetic_two', 'mmlu-other', 'cmmlu-china-specific',
+        'mmlu_pro_math'
     ]])
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml
index c95e7b91..413a99a3 100644
--- a/.github/scripts/oc_score_baseline_fullbench.yaml
+++ b/.github/scripts/oc_score_baseline_fullbench.yaml
@@ -2,19 +2,24 @@ internlm2_5-7b-chat-hf_fullbench:
   race-high: 93.75
   ARC-c: 93.75
   BoolQ: 81.25
+  triviaqa_wiki_1shot: 50
+  nq_open_1shot: 25
+  IFEval: 50
   drop: 81.25
   GPQA_diamond: 25
+  hellaswag: 87.5
+  TheoremQA: 18.75
+  musr_average: 39.58
+  gsm8k: 56.25
   math: 75
+  cmo_fib: 6.25
+  aime2024: 6.25
   wikibench-wiki-single_choice_cncircular: 50
   sanitized_mbpp: 68.75
   ds1000: 16.96
-  gsm8k: 56.25
-  triviaqa_wiki_1shot: 50
-  nq_open_1shot: 25
-  hellaswag: 87.5
-  TheoremQA: 18.75
-  college: 12.5
-  college_knowledge: 87.5
+  lcb_code_generation: 12.5
+  lcb_code_execution: 43.75
+  lcb_test_output: 18.75
   bbh-logical_deduction_seven_objects: 50
   bbh-multistep_arithmetic_two: 68.75
   mmlu-other: 72.6
@@ -27,6 +32,9 @@ internlm2_5-7b-chat-hf_fullbench:
   ds1000_Sklearn: 18.75
   ds1000_Pytorch: 12.5
   ds1000_Matplotlib: 43.75
+  openai_mmmlu_lite_AR-XY: 37.5
+  college: 12.5
+  college_knowledge: 87.5
   Alignbench总分: 0.65
   Alignbench专业能力: 7.83
   AlpacaEvaltotal: 0
@@ -56,19 +64,24 @@ internlm2_5-7b-chat-turbomind_fullbench:
   race-high: 93.75
   ARC-c: 87.5
   BoolQ: 68.75
+  triviaqa_wiki_1shot: 50
+  nq_open_1shot: 25
+  IFEval: 50
   drop: 75
-  GPQA_diamond: 25
+  hellaswag: 81.25
+  TheoremQA: 6.25
+  musr_average: 39.58
+  gsm8k: 68.75
   math: 75
+  GPQA_diamond: 25
+  cmo_fib: 6.25
+  aime2024: 6.25
   wikibench-wiki-single_choice_cncircular: 25
   sanitized_mbpp: 68.75
   ds1000: 13.39
-  gsm8k: 68.75
-  triviaqa_wiki_1shot: 50
-  nq_open_1shot: 25
-  hellaswag: 81.25
-  TheoremQA: 6.25
-  college: 0
-  college_knowledge: 87.5
+  lcb_code_generation: 12.5
+  lcb_code_execution: 43.75
+  lcb_test_output: 12.5
   bbh-logical_deduction_seven_objects: 56.25
   bbh-multistep_arithmetic_two: 68.75
   mmlu-other: 74.04
@@ -81,6 +94,9 @@ internlm2_5-7b-chat-turbomind_fullbench:
   ds1000_Sklearn: 18.75
   ds1000_Pytorch: 6.25
   ds1000_Matplotlib: 37.5
+  openai_mmmlu_lite_AR-XY: 37.5
+  college: 0
+  college_knowledge: 87.5
   Alignbench总分: 0.64
   Alignbench专业能力: 7.6
   AlpacaEvaltotal: 10
@@ -121,6 +137,8 @@ internlm2_5-7b-hf_fullbench:
   winogrande: 75
   hellaswag: 93.75
   TheoremQA: 25
+  dingo_en_192: 37.5
+  dingo_zh_170: 100
   college: 12.5
   college_knowledge: 87.5
   bbh-logical_deduction_seven_objects: 43.75
@@ -144,6 +162,8 @@ internlm2_5-7b-turbomind_fullbench:
   winogrande: 87.5
   hellaswag: 93.75
   TheoremQA: 31.25
+  dingo_en_192: 43.75
+  dingo_zh_170: 100
   college: 12.5
   college_knowledge: 87.5
   bbh-logical_deduction_seven_objects: 50
diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml
index f93f8957..68f6660a 100644
--- a/.github/scripts/oc_score_baseline_testrange.yaml
+++ b/.github/scripts/oc_score_baseline_testrange.yaml
@@ -43,11 +43,11 @@ gemma-7b-it-hf:
   race-high: 68.75
 
 gemma-2-9b-it-turbomind:
-  gsm8k: 68.75
+  gsm8k: 65.62
   race-high: 84.38
 
 gemma-7b-it-vllm:
-  gsm8k: 28.12
+  gsm8k: 34.38
   race-high: 68.75
 
 internlm2_5-7b-chat-hf:
@@ -95,7 +95,7 @@ llama-3_1-8b-instruct-turbomind:
   race-high: 90.62
 
 llama-3_2-3b-instruct-turbomind:
-  gsm8k: 65.62
+  gsm8k: 62.50
   race-high: 81.25
 
 llama-3-8b-instruct-turbomind:
@@ -112,15 +112,15 @@ mistral-7b-instruct-v0.3-hf:
 
 mistral-nemo-instruct-2407-hf:
   gsm8k: 75
-  race-high: 84.38
+  race-high: 81.25
 
 mistral-nemo-instruct-2407-turbomind:
-  gsm8k: 75
-  race-high: 84.38
+  gsm8k: 68.75
+  race-high: 87.50
 
 mistral-7b-instruct-v0.1-vllm:
-  gsm8k: 37.5
-  race-high: 71.88
+  gsm8k: 34.38
+  race-high: 68.75
 
 mistral-7b-instruct-v0.2-vllm:
   gsm8k: 43.75
@@ -255,13 +255,13 @@ gemma-7b-hf:
   winogrande: 78.12
 
 gemma-2b-vllm:
-  gsm8k: 18.75
+  gsm8k: 15.62
   GPQA_diamond: 6.25
   race-high:
   winogrande:
 
 gemma-7b-vllm:
-  gsm8k: 59.38
+  gsm8k: 53.12
   GPQA_diamond: 6.25
   race-high:
   winogrande:
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
index 125aaa71..1d7a1189 100644
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@@ -163,9 +163,9 @@ jobs:
           pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
           pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
           pip uninstall torch torchvision torchaudio -y
-          pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
-          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
-          pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
+          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu11torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+          pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
           conda info --envs
           pip list
       - name: Prepare - create conda env and install torch - cu12
@@ -183,9 +183,9 @@ jobs:
           pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
           pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
           pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
-          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
-          pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
+          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+          pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
           conda info --envs
           pip list
       - name: Prepare - reinstall lmdeploy - cu12
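
Note on the config pattern this patch extends: the fullbench script repeatedly trims an imported dataset list down to one subset by matching on its 'abbr' field, and it now builds 'summary_groups' by scanning module-level names that end in '_summary_groups' before appending a hand-written MathBench group. A minimal self-contained sketch of that pattern follows; the toy dataset dicts stand in for the real OpenCompass imports, so it runs on its own:

# Toy stand-ins for the imported OpenCompass dataset configs: each config
# is just a list of dicts keyed by 'abbr'.
mmmlu_lite_datasets = [
    {'abbr': 'openai_mmmlu_lite_AR-XY'},
    {'abbr': 'openai_mmmlu_lite_ZH-CN'},
]
mathbench_datasets = [
    {'abbr': 'mathbench-college-single_choice_en'},
    {'abbr': 'mathbench-primary-cloze_en'},
]

# Keep only the subsets the daily run needs (the same filters as the config).
mmmlu_lite_datasets = [
    x for x in mmmlu_lite_datasets if 'mmlu_lite_AR-XY' in x['abbr']
]
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]

# Summary groups are gathered by scanning module-level names ending in
# '_summary_groups', then extended with a hand-written MathBench group.
mmlu_summary_groups = [{'name': 'mmlu', 'subsets': ['mmlu-stem']}]
teval_summary_groups = [{'name': 'teval', 'subsets': ['teval-en']}]

summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
summary_groups.append({
    'name': 'Mathbench',
    'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
})

print([d['abbr'] for d in mmmlu_lite_datasets])
# -> ['openai_mmmlu_lite_AR-XY']
print([g['name'] for g in summary_groups])
# -> ['mmlu', 'teval', 'Mathbench']

Scanning locals() works here because the filters run at module level, where locals() is the module namespace; the summary_groups name itself is not yet bound when the comprehension executes, so it does not pick itself up.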
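oc_score_assert.py, only partially shown in this diff, parametrizes over (model, dataset) pairs and compares each run's score against oc_score_baseline_fullbench.yaml. A sketch of that check under stated assumptions: the helper name, tolerance, and YAML snippet below are illustrative, not the CI's exact values.

import yaml  # PyYAML

# Illustrative baseline snippet in the same shape as
# oc_score_baseline_fullbench.yaml: model -> dataset -> score.
BASELINE_YAML = """
internlm2_5-7b-chat-hf_fullbench:
  race-high: 93.75
  cmo_fib: 6.25
"""


def assert_score(baseline, model, dataset, actual, rel_tol=0.05):
    """Fail when a score drifts more than rel_tol from its baseline."""
    expected = baseline[model][dataset]
    assert abs(actual - expected) <= rel_tol * max(abs(expected), 1), (
        f'{model}/{dataset}: got {actual}, baseline {expected}')


baseline = yaml.safe_load(BASELINE_YAML)
assert_score(baseline, 'internlm2_5-7b-chat-hf_fullbench', 'race-high', 92.5)

Because the fullbench runs use tiny per-dataset samples (scores land on multiples of 6.25), a small drift window of this sort is what keeps the assertions from flaking on single-example swings.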
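The workflow change bumps the pinned stack to torch 2.5.1 / flash-attn 2.7.0.post2 / xformers 0.0.28.post3, and the prebuilt flash-attn and xformers wheels must match the torch build they were compiled against. A small, hypothetical sanity check for that alignment; the pin table is copied from the diff, but the script itself is an assumption and not part of the CI:

from importlib.metadata import PackageNotFoundError, version

# Pins copied from the cu11/cu12 install steps above. 'flash_attn' is the
# distribution name the flash-attn wheels register under.
PINS = {
    'torch': '2.5.1',
    'torchvision': '0.20.1',
    'torchaudio': '2.5.1',
    'flash_attn': '2.7.0.post2',
    'xformers': '0.0.28.post3',
}

for pkg, expected in PINS.items():
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        print(f'{pkg}: not installed (expected {expected})')
        continue
    # startswith tolerates local version tags such as '2.5.1+cu118'.
    ok = installed.startswith(expected)
    print(f'{pkg}: {installed} ({"ok" if ok else "MISMATCH, want " + expected})')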