From bc22749fd8c20d4f69c2c4ebb9e517bce2c4666a Mon Sep 17 00:00:00 2001
From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
Date: Thu, 20 Feb 2025 14:08:18 +0800
Subject: [PATCH] [CI] update daily test scores (#1870)

* Update daily test score baselines

* Update daily-run-test.yml

* Update dlc.py
---
 .../scripts/oc_score_baseline_fullbench.yaml  |  6 +++---
 .../scripts/oc_score_baseline_testrange.yaml  | 20 +++++++++----------
 .github/workflows/daily-run-test.yml          | 13 ++++++------
 opencompass/runners/dlc.py                    |  2 +-
 4 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml
index 6ab32832..5b0dee2b 100644
--- a/.github/scripts/oc_score_baseline_fullbench.yaml
+++ b/.github/scripts/oc_score_baseline_fullbench.yaml
@@ -366,7 +366,7 @@ internlm2_5-7b-chat-turbomind:
         openai_mmmlu_lite_DE-DE_accuracy: 51.27
         openai_mmmlu_lite_ES-LA_accuracy: 56.94
         openai_mmmlu_lite_FR-FR_accuracy: 58.22
-        openai_mmmlu_lite_HI-IN_accuracy: 33.75
+        openai_mmmlu_lite_HI-IN_accuracy: 30.75
         openai_mmmlu_lite_ID-ID_accuracy: 50.6
         openai_mmmlu_lite_IT-IT_accuracy: 50.6
         openai_mmmlu_lite_JA-JP_accuracy: 51.13
@@ -394,7 +394,7 @@ internlm2_5-7b-chat-turbomind:
         CompassArena_naive_average: 34.61
         FoFo_naive_average: 0.38
         mtbench101_avg: 8.01
-        wildbench_average: -15.69
+        wildbench_average: -10.49
         simpleqa_accuracy_given_attempted: 0.04
         chinese_simpleqa_given_attempted_accuracy: 0.34
         alignment_bench_v1_1_专业能力: 6.05
@@ -414,7 +414,7 @@ internlm2_5-7b-chat-turbomind:
         compassarena_knowledge_naive_average: 36
         compassarena_reason_v2_naive_average: 35
         compassarena_math_v2_naive_average: 19.91
-        compassarena_creationv2_zh_naive_average: 35.81
+        compassarena_creationv2_zh_naive_average: 43.64
         fofo_test_prompts_overall: 0.35
         fofo_test_prompts_cn_overall: 0.41
         followbench_llmeval_en_HSR_AVG: 0.73
diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml
index 45e20ddd..5f1121a7 100644
--- a/.github/scripts/oc_score_baseline_testrange.yaml
+++ b/.github/scripts/oc_score_baseline_testrange.yaml
@@ -15,13 +15,13 @@ chat:
         gsm8k_accuracy: 50
         race-high_accuracy: 68.75
     deepseek-7b-chat-vllm:
-        gsm8k_accuracy: 43.75
-        race-high_accuracy: 75
+        gsm8k_accuracy: 50
+        race-high_accuracy: 78.12
     gemma2-2b-it-hf:
         gsm8k_accuracy: 50
-        race-high_accuracy: 71.88
+        race-high_accuracy: 75
     gemma2-9b-it-hf:
-        gsm8k_accuracy: 71.88
+        gsm8k_accuracy: 68.75
         race-high_accuracy: 84.38
     gemma-2b-it-hf:
         gsm8k_accuracy: 3.12
@@ -36,7 +36,7 @@ chat:
         gsm8k_accuracy: 78.12
         race-high_accuracy: 93.75
     gemma-7b-it-vllm:
-        gsm8k_accuracy: 34.38
+        gsm8k_accuracy: 46.88
         race-high_accuracy: 68.75
     internlm2_5-7b-chat-hf:
         gsm8k_accuracy: 84.38
@@ -57,7 +57,7 @@ chat:
         gsm8k_accuracy: 53.12
         race-high_accuracy: 90.62
     internlm2-chat-7b-vllm:
-        gsm8k_accuracy: 56.25
+        gsm8k_accuracy: 43.75
         race-high_accuracy: 84.38
     llama-3_1-8b-instruct-hf:
         gsm8k_accuracy: 84.38
@@ -90,13 +90,13 @@ chat:
         gsm8k_accuracy: 75
         race-high_accuracy: 81.25
     mistral-nemo-instruct-2407-turbomind:
-        gsm8k_accuracy: 65.62
-        race-high_accuracy: 87.50
+        gsm8k_accuracy: 71.88
+        race-high_accuracy: 78.12
     mistral-7b-instruct-v0.1-vllm:
         gsm8k_accuracy: 34.38
         race-high_accuracy: 68.75
     mistral-7b-instruct-v0.2-vllm:
-        gsm8k_accuracy: 43.75
+        gsm8k_accuracy: 31.25
         race-high_accuracy: 75
     phi-3-mini-4k-instruct-hf:
         gsm8k_accuracy: 81.25
@@ -177,7 +177,7 @@ chat:
         gsm8k_accuracy: 93.75
         race-high_accuracy: 87.5
     mixtral-8x7b-instruct-v0.1-hf:
-        gsm8k_accuracy: 56.25
+        gsm8k_accuracy: 59.38
         race-high_accuracy: 81.25
     mixtral-large-instruct-2411-turbomind:
         gsm8k_accuracy: 90.62
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
index 0fa1f4a6..8aa1df16 100644
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@@ -17,7 +17,7 @@ on:
         required: false
         description: 'whether to build lmdeploy'
         type:  boolean
-        default: false
+        default: true
       repo_org_lmdeploy:
         required: false
         description: 'Tested repository organization name. Default is internlm/lmdeploy'
@@ -162,15 +162,16 @@ jobs:
             pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
             cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
       - name: Prepare - reinstall lmdeploy - cu12
-        if: ${{inputs.build_lmdeploy}}
+        if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
         uses: actions/download-artifact@v4
         with:
           name: my-artifact-${{ github.run_id }}-py310
       - name: Prepare - reinstall lmdeploy - cu12
-        if: ${{inputs.build_lmdeploy}}
+        if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
         run: |
           . ${{env.CONDA_PATH}}/bin/activate
           conda activate ${{env.CONDA_ENV}}
+          pip uninstall -y lmdeploy
           pip install lmdeploy-*.whl --no-deps
       - name: conda env
         run: |
@@ -188,7 +189,7 @@ jobs:
         regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
     runs-on: volc_cu12_daily
     environment: 'prod'
-    timeout-minutes: 120 #2hours
+    timeout-minutes: 180 #3hours
     steps:
       - name: Clone repository
         uses: actions/checkout@v2
@@ -275,7 +276,7 @@ jobs:
           conda info --envs
           lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log  2>&1  &
           echo "restful_pid=$!" >> "$GITHUB_ENV"
-          sleep 120s
+          sleep 180s
           opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
           rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
           python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
@@ -334,7 +335,7 @@ jobs:
 
 
   notify_to_feishu:
-    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
+    if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
     needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
     timeout-minutes: 5
     runs-on: self-hosted
diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py
index 03045870..44e9fd00 100644
--- a/opencompass/runners/dlc.py
+++ b/opencompass/runners/dlc.py
@@ -286,7 +286,7 @@ class DLCRunner(BaseRunner):
                             f'Failed to get job info for {job_id}')
 
                     status = job_info['Status']
-                    if status == 'Failed':
+                    if status == 'Failed' or status == 'Stopped':
                         return -1
                     elif status == 'Succeeded':
                         return 0