From bc22749fd8c20d4f69c2c4ebb9e517bce2c4666a Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Thu, 20 Feb 2025 14:08:18 +0800 Subject: [PATCH] [CI] update daily test scores (#1870) * update * Update daily-run-test.yml * Update dlc.py --- .../scripts/oc_score_baseline_fullbench.yaml | 6 +++--- .../scripts/oc_score_baseline_testrange.yaml | 20 +++++++++---------- .github/workflows/daily-run-test.yml | 13 ++++++------ opencompass/runners/dlc.py | 2 +- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 6ab32832..5b0dee2b 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -366,7 +366,7 @@ internlm2_5-7b-chat-turbomind: openai_mmmlu_lite_DE-DE_accuracy: 51.27 openai_mmmlu_lite_ES-LA_accuracy: 56.94 openai_mmmlu_lite_FR-FR_accuracy: 58.22 - openai_mmmlu_lite_HI-IN_accuracy: 33.75 + openai_mmmlu_lite_HI-IN_accuracy: 30.75 openai_mmmlu_lite_ID-ID_accuracy: 50.6 openai_mmmlu_lite_IT-IT_accuracy: 50.6 openai_mmmlu_lite_JA-JP_accuracy: 51.13 @@ -394,7 +394,7 @@ internlm2_5-7b-chat-turbomind: CompassArena_naive_average: 34.61 FoFo_naive_average: 0.38 mtbench101_avg: 8.01 - wildbench_average: -15.69 + wildbench_average: -10.49 simpleqa_accuracy_given_attempted: 0.04 chinese_simpleqa_given_attempted_accuracy: 0.34 alignment_bench_v1_1_专业能力: 6.05 @@ -414,7 +414,7 @@ internlm2_5-7b-chat-turbomind: compassarena_knowledge_naive_average: 36 compassarena_reason_v2_naive_average: 35 compassarena_math_v2_naive_average: 19.91 - compassarena_creationv2_zh_naive_average: 35.81 + compassarena_creationv2_zh_naive_average: 43.64 fofo_test_prompts_overall: 0.35 fofo_test_prompts_cn_overall: 0.41 followbench_llmeval_en_HSR_AVG: 0.73 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index 45e20ddd..5f1121a7 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -15,13 +15,13 @@ chat: gsm8k_accuracy: 50 race-high_accuracy: 68.75 deepseek-7b-chat-vllm: - gsm8k_accuracy: 43.75 - race-high_accuracy: 75 + gsm8k_accuracy: 50 + race-high_accuracy: 78.12 gemma2-2b-it-hf: gsm8k_accuracy: 50 - race-high_accuracy: 71.88 + race-high_accuracy: 75 gemma2-9b-it-hf: - gsm8k_accuracy: 71.88 + gsm8k_accuracy: 68.75 race-high_accuracy: 84.38 gemma-2b-it-hf: gsm8k_accuracy: 3.12 @@ -36,7 +36,7 @@ chat: gsm8k_accuracy: 78.12 race-high_accuracy: 93.75 gemma-7b-it-vllm: - gsm8k_accuracy: 34.38 + gsm8k_accuracy: 46.88 race-high_accuracy: 68.75 internlm2_5-7b-chat-hf: gsm8k_accuracy: 84.38 @@ -57,7 +57,7 @@ chat: gsm8k_accuracy: 53.12 race-high_accuracy: 90.62 internlm2-chat-7b-vllm: - gsm8k_accuracy: 56.25 + gsm8k_accuracy: 43.75 race-high_accuracy: 84.38 llama-3_1-8b-instruct-hf: gsm8k_accuracy: 84.38 @@ -90,13 +90,13 @@ chat: gsm8k_accuracy: 75 race-high_accuracy: 81.25 mistral-nemo-instruct-2407-turbomind: - gsm8k_accuracy: 65.62 - race-high_accuracy: 87.50 + gsm8k_accuracy: 71.88 + race-high_accuracy: 78.12 mistral-7b-instruct-v0.1-vllm: gsm8k_accuracy: 34.38 race-high_accuracy: 68.75 mistral-7b-instruct-v0.2-vllm: - gsm8k_accuracy: 43.75 + gsm8k_accuracy: 31.25 race-high_accuracy: 75 phi-3-mini-4k-instruct-hf: gsm8k_accuracy: 81.25 @@ -177,7 +177,7 @@ chat: gsm8k_accuracy: 93.75 race-high_accuracy: 87.5 mixtral-8x7b-instruct-v0.1-hf: - gsm8k_accuracy: 56.25 + gsm8k_accuracy: 59.38 race-high_accuracy: 81.25 mixtral-large-instruct-2411-turbomind: gsm8k_accuracy: 90.62 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 0fa1f4a6..8aa1df16 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -17,7 +17,7 @@ on: required: false description: 'whether to build lmdeploy' type: boolean - default: false + default: true repo_org_lmdeploy: required: false description: 'Tested repository organization name. Default is internlm/lmdeploy' @@ -162,15 +162,16 @@ jobs: pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}} cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data - name: Prepare - reinstall lmdeploy - cu12 - if: ${{inputs.build_lmdeploy}} + if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}} uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }}-py310 - name: Prepare - reinstall lmdeploy - cu12 - if: ${{inputs.build_lmdeploy}} + if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}} run: | . ${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} + pip uninstall -y lmdeploy pip install lmdeploy-*.whl --no-deps - name: conda env run: | @@ -188,7 +189,7 @@ jobs: regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}} runs-on: volc_cu12_daily environment: 'prod' - timeout-minutes: 120 #2hours + timeout-minutes: 180 #3hours steps: - name: Clone repository uses: actions/checkout@v2 @@ -275,7 +276,7 @@ jobs: conda info --envs lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & echo "restful_pid=$!" >> "$GITHUB_ENV" - sleep 120s + sleep 180s opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py @@ -334,7 +335,7 @@ jobs: notify_to_feishu: - if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} + if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test] timeout-minutes: 5 runs-on: self-hosted diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 03045870..44e9fd00 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -286,7 +286,7 @@ class DLCRunner(BaseRunner): f'Failed to get job info for {job_id}') status = job_info['Status'] - if status == 'Failed': + if status == 'Failed' or status == 'Stopped': return -1 elif status == 'Succeeded': return 0