[CI] update daily test scores (#1870)

* update

* Update daily-run-test.yml

* Update dlc.py
This commit is contained in:
zhulinJulia24 2025-02-20 14:08:18 +08:00 committed by GitHub
parent f407930475
commit bc22749fd8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 21 additions and 20 deletions

View File

@@ -366,7 +366,7 @@ internlm2_5-7b-chat-turbomind:
openai_mmmlu_lite_DE-DE_accuracy: 51.27
openai_mmmlu_lite_ES-LA_accuracy: 56.94
openai_mmmlu_lite_FR-FR_accuracy: 58.22
openai_mmmlu_lite_HI-IN_accuracy: 33.75
openai_mmmlu_lite_HI-IN_accuracy: 30.75
openai_mmmlu_lite_ID-ID_accuracy: 50.6
openai_mmmlu_lite_IT-IT_accuracy: 50.6
openai_mmmlu_lite_JA-JP_accuracy: 51.13
@@ -394,7 +394,7 @@ internlm2_5-7b-chat-turbomind:
CompassArena_naive_average: 34.61
FoFo_naive_average: 0.38
mtbench101_avg: 8.01
wildbench_average: -15.69
wildbench_average: -10.49
simpleqa_accuracy_given_attempted: 0.04
chinese_simpleqa_given_attempted_accuracy: 0.34
alignment_bench_v1_1_专业能力: 6.05
@@ -414,7 +414,7 @@ internlm2_5-7b-chat-turbomind:
compassarena_knowledge_naive_average: 36
compassarena_reason_v2_naive_average: 35
compassarena_math_v2_naive_average: 19.91
compassarena_creationv2_zh_naive_average: 35.81
compassarena_creationv2_zh_naive_average: 43.64
fofo_test_prompts_overall: 0.35
fofo_test_prompts_cn_overall: 0.41
followbench_llmeval_en_HSR_AVG: 0.73

View File

@@ -15,13 +15,13 @@ chat:
gsm8k_accuracy: 50
race-high_accuracy: 68.75
deepseek-7b-chat-vllm:
gsm8k_accuracy: 43.75
race-high_accuracy: 75
gsm8k_accuracy: 50
race-high_accuracy: 78.12
gemma2-2b-it-hf:
gsm8k_accuracy: 50
race-high_accuracy: 71.88
race-high_accuracy: 75
gemma2-9b-it-hf:
gsm8k_accuracy: 71.88
gsm8k_accuracy: 68.75
race-high_accuracy: 84.38
gemma-2b-it-hf:
gsm8k_accuracy: 3.12
@@ -36,7 +36,7 @@ chat:
gsm8k_accuracy: 78.12
race-high_accuracy: 93.75
gemma-7b-it-vllm:
gsm8k_accuracy: 34.38
gsm8k_accuracy: 46.88
race-high_accuracy: 68.75
internlm2_5-7b-chat-hf:
gsm8k_accuracy: 84.38
@@ -57,7 +57,7 @@ chat:
gsm8k_accuracy: 53.12
race-high_accuracy: 90.62
internlm2-chat-7b-vllm:
gsm8k_accuracy: 56.25
gsm8k_accuracy: 43.75
race-high_accuracy: 84.38
llama-3_1-8b-instruct-hf:
gsm8k_accuracy: 84.38
@@ -90,13 +90,13 @@ chat:
gsm8k_accuracy: 75
race-high_accuracy: 81.25
mistral-nemo-instruct-2407-turbomind:
gsm8k_accuracy: 65.62
race-high_accuracy: 87.50
gsm8k_accuracy: 71.88
race-high_accuracy: 78.12
mistral-7b-instruct-v0.1-vllm:
gsm8k_accuracy: 34.38
race-high_accuracy: 68.75
mistral-7b-instruct-v0.2-vllm:
gsm8k_accuracy: 43.75
gsm8k_accuracy: 31.25
race-high_accuracy: 75
phi-3-mini-4k-instruct-hf:
gsm8k_accuracy: 81.25
@@ -177,7 +177,7 @@ chat:
gsm8k_accuracy: 93.75
race-high_accuracy: 87.5
mixtral-8x7b-instruct-v0.1-hf:
gsm8k_accuracy: 56.25
gsm8k_accuracy: 59.38
race-high_accuracy: 81.25
mixtral-large-instruct-2411-turbomind:
gsm8k_accuracy: 90.62

View File

@@ -17,7 +17,7 @@ on:
required: false
description: 'whether to build lmdeploy'
type: boolean
default: false
default: true
repo_org_lmdeploy:
required: false
description: 'Tested repository organization name. Default is internlm/lmdeploy'
@@ -162,15 +162,16 @@ jobs:
pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
- name: Prepare - reinstall lmdeploy - cu12
if: ${{inputs.build_lmdeploy}}
if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}-py310
- name: Prepare - reinstall lmdeploy - cu12
if: ${{inputs.build_lmdeploy}}
if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
pip uninstall -y lmdeploy
pip install lmdeploy-*.whl --no-deps
- name: conda env
run: |
@@ -188,7 +189,7 @@ jobs:
regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
runs-on: volc_cu12_daily
environment: 'prod'
timeout-minutes: 120 #2hours
timeout-minutes: 180 #3hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@@ -275,7 +276,7 @@ jobs:
conda info --envs
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 120s
sleep 180s
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
@@ -334,7 +335,7 @@ jobs:
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
timeout-minutes: 5
runs-on: self-hosted

View File

@@ -286,7 +286,7 @@ class DLCRunner(BaseRunner):
f'Failed to get job info for {job_id}')
status = job_info['Status']
if status == 'Failed':
if status == 'Failed' or status == 'Stopped':
return -1
elif status == 'Succeeded':
return 0