mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[CI] update daily test scores (#1870)
* update * Update daily-run-test.yml * Update dlc.py
This commit is contained in:
parent
f407930475
commit
bc22749fd8
@ -366,7 +366,7 @@ internlm2_5-7b-chat-turbomind:
|
||||
openai_mmmlu_lite_DE-DE_accuracy: 51.27
|
||||
openai_mmmlu_lite_ES-LA_accuracy: 56.94
|
||||
openai_mmmlu_lite_FR-FR_accuracy: 58.22
|
||||
openai_mmmlu_lite_HI-IN_accuracy: 33.75
|
||||
openai_mmmlu_lite_HI-IN_accuracy: 30.75
|
||||
openai_mmmlu_lite_ID-ID_accuracy: 50.6
|
||||
openai_mmmlu_lite_IT-IT_accuracy: 50.6
|
||||
openai_mmmlu_lite_JA-JP_accuracy: 51.13
|
||||
@ -394,7 +394,7 @@ internlm2_5-7b-chat-turbomind:
|
||||
CompassArena_naive_average: 34.61
|
||||
FoFo_naive_average: 0.38
|
||||
mtbench101_avg: 8.01
|
||||
wildbench_average: -15.69
|
||||
wildbench_average: -10.49
|
||||
simpleqa_accuracy_given_attempted: 0.04
|
||||
chinese_simpleqa_given_attempted_accuracy: 0.34
|
||||
alignment_bench_v1_1_专业能力: 6.05
|
||||
@ -414,7 +414,7 @@ internlm2_5-7b-chat-turbomind:
|
||||
compassarena_knowledge_naive_average: 36
|
||||
compassarena_reason_v2_naive_average: 35
|
||||
compassarena_math_v2_naive_average: 19.91
|
||||
compassarena_creationv2_zh_naive_average: 35.81
|
||||
compassarena_creationv2_zh_naive_average: 43.64
|
||||
fofo_test_prompts_overall: 0.35
|
||||
fofo_test_prompts_cn_overall: 0.41
|
||||
followbench_llmeval_en_HSR_AVG: 0.73
|
||||
|
20
.github/scripts/oc_score_baseline_testrange.yaml
vendored
20
.github/scripts/oc_score_baseline_testrange.yaml
vendored
@ -15,13 +15,13 @@ chat:
|
||||
gsm8k_accuracy: 50
|
||||
race-high_accuracy: 68.75
|
||||
deepseek-7b-chat-vllm:
|
||||
gsm8k_accuracy: 43.75
|
||||
race-high_accuracy: 75
|
||||
gsm8k_accuracy: 50
|
||||
race-high_accuracy: 78.12
|
||||
gemma2-2b-it-hf:
|
||||
gsm8k_accuracy: 50
|
||||
race-high_accuracy: 71.88
|
||||
race-high_accuracy: 75
|
||||
gemma2-9b-it-hf:
|
||||
gsm8k_accuracy: 71.88
|
||||
gsm8k_accuracy: 68.75
|
||||
race-high_accuracy: 84.38
|
||||
gemma-2b-it-hf:
|
||||
gsm8k_accuracy: 3.12
|
||||
@ -36,7 +36,7 @@ chat:
|
||||
gsm8k_accuracy: 78.12
|
||||
race-high_accuracy: 93.75
|
||||
gemma-7b-it-vllm:
|
||||
gsm8k_accuracy: 34.38
|
||||
gsm8k_accuracy: 46.88
|
||||
race-high_accuracy: 68.75
|
||||
internlm2_5-7b-chat-hf:
|
||||
gsm8k_accuracy: 84.38
|
||||
@ -57,7 +57,7 @@ chat:
|
||||
gsm8k_accuracy: 53.12
|
||||
race-high_accuracy: 90.62
|
||||
internlm2-chat-7b-vllm:
|
||||
gsm8k_accuracy: 56.25
|
||||
gsm8k_accuracy: 43.75
|
||||
race-high_accuracy: 84.38
|
||||
llama-3_1-8b-instruct-hf:
|
||||
gsm8k_accuracy: 84.38
|
||||
@ -90,13 +90,13 @@ chat:
|
||||
gsm8k_accuracy: 75
|
||||
race-high_accuracy: 81.25
|
||||
mistral-nemo-instruct-2407-turbomind:
|
||||
gsm8k_accuracy: 65.62
|
||||
race-high_accuracy: 87.50
|
||||
gsm8k_accuracy: 71.88
|
||||
race-high_accuracy: 78.12
|
||||
mistral-7b-instruct-v0.1-vllm:
|
||||
gsm8k_accuracy: 34.38
|
||||
race-high_accuracy: 68.75
|
||||
mistral-7b-instruct-v0.2-vllm:
|
||||
gsm8k_accuracy: 43.75
|
||||
gsm8k_accuracy: 31.25
|
||||
race-high_accuracy: 75
|
||||
phi-3-mini-4k-instruct-hf:
|
||||
gsm8k_accuracy: 81.25
|
||||
@ -177,7 +177,7 @@ chat:
|
||||
gsm8k_accuracy: 93.75
|
||||
race-high_accuracy: 87.5
|
||||
mixtral-8x7b-instruct-v0.1-hf:
|
||||
gsm8k_accuracy: 56.25
|
||||
gsm8k_accuracy: 59.38
|
||||
race-high_accuracy: 81.25
|
||||
mixtral-large-instruct-2411-turbomind:
|
||||
gsm8k_accuracy: 90.62
|
||||
|
13
.github/workflows/daily-run-test.yml
vendored
13
.github/workflows/daily-run-test.yml
vendored
@ -17,7 +17,7 @@ on:
|
||||
required: false
|
||||
description: 'whether to build lmdeploy'
|
||||
type: boolean
|
||||
default: false
|
||||
default: true
|
||||
repo_org_lmdeploy:
|
||||
required: false
|
||||
description: 'Tested repository organization name. Default is internlm/lmdeploy'
|
||||
@ -162,15 +162,16 @@ jobs:
|
||||
pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
|
||||
- name: Prepare - reinstall lmdeploy - cu12
|
||||
if: ${{inputs.build_lmdeploy}}
|
||||
if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: my-artifact-${{ github.run_id }}-py310
|
||||
- name: Prepare - reinstall lmdeploy - cu12
|
||||
if: ${{inputs.build_lmdeploy}}
|
||||
if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
|
||||
run: |
|
||||
. ${{env.CONDA_PATH}}/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
pip uninstall -y lmdeploy
|
||||
pip install lmdeploy-*.whl --no-deps
|
||||
- name: conda env
|
||||
run: |
|
||||
@ -188,7 +189,7 @@ jobs:
|
||||
regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
|
||||
runs-on: volc_cu12_daily
|
||||
environment: 'prod'
|
||||
timeout-minutes: 120 #2hours
|
||||
timeout-minutes: 180 #3hours
|
||||
steps:
|
||||
- name: Clone repository
|
||||
uses: actions/checkout@v2
|
||||
@ -275,7 +276,7 @@ jobs:
|
||||
conda info --envs
|
||||
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
|
||||
echo "restful_pid=$!" >> "$GITHUB_ENV"
|
||||
sleep 120s
|
||||
sleep 180s
|
||||
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
|
||||
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
@ -334,7 +335,7 @@ jobs:
|
||||
|
||||
|
||||
notify_to_feishu:
|
||||
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
|
||||
if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
|
||||
needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
|
||||
timeout-minutes: 5
|
||||
runs-on: self-hosted
|
||||
|
@ -286,7 +286,7 @@ class DLCRunner(BaseRunner):
|
||||
f'Failed to get job info for {job_id}')
|
||||
|
||||
status = job_info['Status']
|
||||
if status == 'Failed':
|
||||
if status == 'Failed' or status == 'Stopped':
|
||||
return -1
|
||||
elif status == 'Succeeded':
|
||||
return 0
|
||||
|
Loading…
Reference in New Issue
Block a user