[CI] update daily test scores (#1870)

* update
* Update daily-run-test.yml
* Update dlc.py
2025-05-30 16:03:24 +08:00 · 2025-02-20 14:08:18 +08:00 · 2025-02-20 14:08:18 +08:00 · bc22749fd8
commit bc22749fd8
parent f407930475
4 changed files with 21 additions and 20 deletions
--- a/.github/scripts/oc_score_baseline_fullbench.yaml
+++ b/.github/scripts/oc_score_baseline_fullbench.yaml
@@ -366,7 +366,7 @@ internlm2_5-7b-chat-turbomind:
        openai_mmmlu_lite_DE-DE_accuracy: 51.27
        openai_mmmlu_lite_ES-LA_accuracy: 56.94
        openai_mmmlu_lite_FR-FR_accuracy: 58.22
-        openai_mmmlu_lite_HI-IN_accuracy: 33.75
+        openai_mmmlu_lite_HI-IN_accuracy: 30.75
        openai_mmmlu_lite_ID-ID_accuracy: 50.6
        openai_mmmlu_lite_IT-IT_accuracy: 50.6
        openai_mmmlu_lite_JA-JP_accuracy: 51.13
@@ -394,7 +394,7 @@ internlm2_5-7b-chat-turbomind:
        CompassArena_naive_average: 34.61
        FoFo_naive_average: 0.38
        mtbench101_avg: 8.01
-        wildbench_average: -15.69
+        wildbench_average: -10.49
        simpleqa_accuracy_given_attempted: 0.04
        chinese_simpleqa_given_attempted_accuracy: 0.34
        alignment_bench_v1_1_专业能力: 6.05
@@ -414,7 +414,7 @@ internlm2_5-7b-chat-turbomind:
        compassarena_knowledge_naive_average: 36
        compassarena_reason_v2_naive_average: 35
        compassarena_math_v2_naive_average: 19.91
-        compassarena_creationv2_zh_naive_average: 35.81
+        compassarena_creationv2_zh_naive_average: 43.64
        fofo_test_prompts_overall: 0.35
        fofo_test_prompts_cn_overall: 0.41
        followbench_llmeval_en_HSR_AVG: 0.73
--- a/.github/scripts/oc_score_baseline_testrange.yaml
+++ b/.github/scripts/oc_score_baseline_testrange.yaml
@@ -15,13 +15,13 @@ chat:
        gsm8k_accuracy: 50
        race-high_accuracy: 68.75
    deepseek-7b-chat-vllm:
-        gsm8k_accuracy: 43.75
-        race-high_accuracy: 75
+        gsm8k_accuracy: 50
+        race-high_accuracy: 78.12
    gemma2-2b-it-hf:
        gsm8k_accuracy: 50
-        race-high_accuracy: 71.88
+        race-high_accuracy: 75
    gemma2-9b-it-hf:
-        gsm8k_accuracy: 71.88
+        gsm8k_accuracy: 68.75
        race-high_accuracy: 84.38
    gemma-2b-it-hf:
        gsm8k_accuracy: 3.12
@@ -36,7 +36,7 @@ chat:
        gsm8k_accuracy: 78.12
        race-high_accuracy: 93.75
    gemma-7b-it-vllm:
-        gsm8k_accuracy: 34.38
+        gsm8k_accuracy: 46.88
        race-high_accuracy: 68.75
    internlm2_5-7b-chat-hf:
        gsm8k_accuracy: 84.38
@@ -57,7 +57,7 @@ chat:
        gsm8k_accuracy: 53.12
        race-high_accuracy: 90.62
    internlm2-chat-7b-vllm:
-        gsm8k_accuracy: 56.25
+        gsm8k_accuracy: 43.75
        race-high_accuracy: 84.38
    llama-3_1-8b-instruct-hf:
        gsm8k_accuracy: 84.38
@@ -90,13 +90,13 @@ chat:
        gsm8k_accuracy: 75
        race-high_accuracy: 81.25
    mistral-nemo-instruct-2407-turbomind:
-        gsm8k_accuracy: 65.62
-        race-high_accuracy: 87.50
+        gsm8k_accuracy: 71.88
+        race-high_accuracy: 78.12
    mistral-7b-instruct-v0.1-vllm:
        gsm8k_accuracy: 34.38
        race-high_accuracy: 68.75
    mistral-7b-instruct-v0.2-vllm:
-        gsm8k_accuracy: 43.75
+        gsm8k_accuracy: 31.25
        race-high_accuracy: 75
    phi-3-mini-4k-instruct-hf:
        gsm8k_accuracy: 81.25
@@ -177,7 +177,7 @@ chat:
        gsm8k_accuracy: 93.75
        race-high_accuracy: 87.5
    mixtral-8x7b-instruct-v0.1-hf:
-        gsm8k_accuracy: 56.25
+        gsm8k_accuracy: 59.38
        race-high_accuracy: 81.25
    mixtral-large-instruct-2411-turbomind:
        gsm8k_accuracy: 90.62
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@@ -17,7 +17,7 @@ on:
        required: false
        description: 'whether to build lmdeploy'
        type:  boolean
-        default: false
+        default: true
      repo_org_lmdeploy:
        required: false
        description: 'Tested repository organization name. Default is internlm/lmdeploy'
@@ -162,15 +162,16 @@ jobs:
            pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
            cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
      - name: Prepare - reinstall lmdeploy - cu12
-        if: ${{inputs.build_lmdeploy}}
+        if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
        uses: actions/download-artifact@v4
        with:
          name: my-artifact-${{ github.run_id }}-py310
      - name: Prepare - reinstall lmdeploy - cu12
-        if: ${{inputs.build_lmdeploy}}
+        if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
+          pip uninstall -y lmdeploy
          pip install lmdeploy-*.whl --no-deps
      - name: conda env
        run: |
@@ -188,7 +189,7 @@ jobs:
        regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
    runs-on: volc_cu12_daily
    environment: 'prod'
-    timeout-minutes: 120 #2hours
+    timeout-minutes: 180 #3hours
    steps:
      - name: Clone repository
        uses: actions/checkout@v2
@@ -275,7 +276,7 @@ jobs:
          conda info --envs
          lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log  2>&1  &
          echo "restful_pid=$!" >> "$GITHUB_ENV"
-          sleep 120s
+          sleep 180s
          opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
          python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
@@ -334,7 +335,7 @@ jobs:


  notify_to_feishu:
-    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
+    if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
    needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
    timeout-minutes: 5
    runs-on: self-hosted
--- a/opencompass/runners/dlc.py
+++ b/opencompass/runners/dlc.py
@@ -286,7 +286,7 @@ class DLCRunner(BaseRunner):
                            f'Failed to get job info for {job_id}')

                    status = job_info['Status']
-                    if status == 'Failed':
+                    if status == 'Failed' or status == 'Stopped':
                        return -1
                    elif status == 'Succeeded':
                        return 0