mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[CI] update dailytest scheduler and baseline's score (#1898)
This commit is contained in:
parent
bdb2d46f59
commit
6042b88e58
2
.github/scripts/eval_regression_api.py
vendored
2
.github/scripts/eval_regression_api.py
vendored
@ -24,7 +24,7 @@ models = [
|
|||||||
abbr='lmdeploy-api-test',
|
abbr='lmdeploy-api-test',
|
||||||
type=OpenAISDK,
|
type=OpenAISDK,
|
||||||
key='EMPTY',
|
key='EMPTY',
|
||||||
openai_api_base='http://localhost:23333/v1',
|
openai_api_base='http://0.0.0.0:23333/v1',
|
||||||
path='internlm2',
|
path='internlm2',
|
||||||
tokenizer_path='internlm/internlm2_5-7b-chat',
|
tokenizer_path='internlm/internlm2_5-7b-chat',
|
||||||
rpm_verbose=True,
|
rpm_verbose=True,
|
||||||
|
@ -42,7 +42,7 @@ internlm2_5-7b-chat-hf_fullbench:
|
|||||||
alpaca_eval_total: 20
|
alpaca_eval_total: 20
|
||||||
arenahard_score: 50
|
arenahard_score: 50
|
||||||
Followbench_naive_average: 1
|
Followbench_naive_average: 1
|
||||||
CompassArena_naive_average: 44.00
|
CompassArena_naive_average: 43
|
||||||
mtbench101_avg: 7.8
|
mtbench101_avg: 7.8
|
||||||
wildbench_average: -12.78
|
wildbench_average: -12.78
|
||||||
simpleqa_accuracy_given_attempted: 0
|
simpleqa_accuracy_given_attempted: 0
|
||||||
@ -58,7 +58,7 @@ internlm2_5-7b-chat-hf_fullbench:
|
|||||||
alpaca_eval_helpful_base: 20
|
alpaca_eval_helpful_base: 20
|
||||||
compassarena_language_naive_average: 35
|
compassarena_language_naive_average: 35
|
||||||
compassarena_knowledge_naive_average: 55
|
compassarena_knowledge_naive_average: 55
|
||||||
compassarena_reason_v2_naive_average: 45.00
|
compassarena_reason_v2_naive_average: 40
|
||||||
compassarena_math_v2_naive_average: 55
|
compassarena_math_v2_naive_average: 55
|
||||||
compassarena_creationv2_zh_naive_average: 30
|
compassarena_creationv2_zh_naive_average: 30
|
||||||
followbench_llmeval_en_HSR_AVG: 1
|
followbench_llmeval_en_HSR_AVG: 1
|
||||||
|
32
.github/scripts/oc_score_baseline_testrange.yaml
vendored
32
.github/scripts/oc_score_baseline_testrange.yaml
vendored
@ -6,7 +6,7 @@ chat:
|
|||||||
gsm8k_accuracy: 71.88
|
gsm8k_accuracy: 71.88
|
||||||
race-high_accuracy: 90.62
|
race-high_accuracy: 90.62
|
||||||
glm-4-9b-chat-vllm:
|
glm-4-9b-chat-vllm:
|
||||||
gsm8k_accuracy: 65.62
|
gsm8k_accuracy: 71.88
|
||||||
race-high_accuracy: 90.62
|
race-high_accuracy: 90.62
|
||||||
deepseek-7b-chat-hf:
|
deepseek-7b-chat-hf:
|
||||||
gsm8k_accuracy: 46.88
|
gsm8k_accuracy: 46.88
|
||||||
@ -63,7 +63,7 @@ chat:
|
|||||||
gsm8k_accuracy: 84.38
|
gsm8k_accuracy: 84.38
|
||||||
race-high_accuracy: 90.62
|
race-high_accuracy: 90.62
|
||||||
llama-3_2-3b-instruct-hf:
|
llama-3_2-3b-instruct-hf:
|
||||||
gsm8k_accuracy: 65.62
|
gsm8k_accuracy: 68.75
|
||||||
race-high_accuracy: 81.25
|
race-high_accuracy: 81.25
|
||||||
llama-3-8b-instruct-hf:
|
llama-3-8b-instruct-hf:
|
||||||
gsm8k_accuracy: 68.75
|
gsm8k_accuracy: 68.75
|
||||||
@ -75,7 +75,7 @@ chat:
|
|||||||
gsm8k_accuracy: 78.12
|
gsm8k_accuracy: 78.12
|
||||||
race-high_accuracy: 90.62
|
race-high_accuracy: 90.62
|
||||||
llama-3_2-3b-instruct-turbomind:
|
llama-3_2-3b-instruct-turbomind:
|
||||||
gsm8k_accuracy: 62.50
|
gsm8k_accuracy: 65.62
|
||||||
race-high_accuracy: 81.25
|
race-high_accuracy: 81.25
|
||||||
llama-3-8b-instruct-turbomind:
|
llama-3-8b-instruct-turbomind:
|
||||||
gsm8k_accuracy: 71.88
|
gsm8k_accuracy: 71.88
|
||||||
@ -226,25 +226,25 @@ base:
|
|||||||
race-high_accuracy: 25
|
race-high_accuracy: 25
|
||||||
winogrande_accuracy: 68.75
|
winogrande_accuracy: 68.75
|
||||||
gemma2-2b-hf:
|
gemma2-2b-hf:
|
||||||
gsm8k_accuracy: 28.12
|
gsm8k_accuracy: 31.25
|
||||||
GPQA_diamond_accuracy: 3.12
|
GPQA_diamond_accuracy: 3.12
|
||||||
race-high_accuracy: 56.25
|
race-high_accuracy: 56.25
|
||||||
winogrande_accuracy: 71.88
|
winogrande_accuracy: 75.00
|
||||||
gemma2-9b-hf:
|
gemma2-9b-hf:
|
||||||
gsm8k_accuracy: 68.75
|
gsm8k_accuracy: 75.00
|
||||||
GPQA_diamond_accuracy: 0
|
GPQA_diamond_accuracy: 0
|
||||||
race-high_accuracy: 81.25
|
race-high_accuracy: 84.38
|
||||||
winogrande_accuracy: 84.38
|
winogrande_accuracy: 81.25
|
||||||
gemma-2b-hf:
|
gemma-2b-hf:
|
||||||
gsm8k_accuracy: 18.75
|
gsm8k_accuracy: 21.88
|
||||||
GPQA_diamond_accuracy: 3.12
|
GPQA_diamond_accuracy: 3.12
|
||||||
race-high_accuracy: 25
|
race-high_accuracy: 21.88
|
||||||
winogrande_accuracy: 53.12
|
winogrande_accuracy: 53.12
|
||||||
gemma-7b-hf:
|
gemma-7b-hf:
|
||||||
gsm8k_accuracy: 56.25
|
gsm8k_accuracy: 56.25
|
||||||
GPQA_diamond_accuracy: 6.25
|
GPQA_diamond_accuracy: 3.12
|
||||||
race-high_accuracy: 65.62
|
race-high_accuracy: 65.62
|
||||||
winogrande_accuracy: 78.12
|
winogrande_accuracy: 71.88
|
||||||
gemma-2b-vllm:
|
gemma-2b-vllm:
|
||||||
gsm8k_accuracy: 15.62
|
gsm8k_accuracy: 15.62
|
||||||
GPQA_diamond_accuracy: 3.12
|
GPQA_diamond_accuracy: 3.12
|
||||||
@ -441,10 +441,10 @@ base:
|
|||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 87.5
|
winogrande_accuracy: 87.5
|
||||||
deepseek-v2-turbomind:
|
deepseek-v2-turbomind:
|
||||||
gsm8k_accuracy: 71.88
|
gsm8k_accuracy: 65.62
|
||||||
GPQA_diamond_accuracy: 3.12
|
GPQA_diamond_accuracy: 15.62
|
||||||
race-high_accuracy: 81.25
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 75
|
winogrande_accuracy: 84.38
|
||||||
llama-3-70b-hf:
|
llama-3-70b-hf:
|
||||||
gsm8k_accuracy: 62.5
|
gsm8k_accuracy: 62.5
|
||||||
GPQA_diamond_accuracy: 3.12
|
GPQA_diamond_accuracy: 3.12
|
||||||
|
16
.github/workflows/daily-run-test.yml
vendored
16
.github/workflows/daily-run-test.yml
vendored
@ -44,7 +44,7 @@ on:
|
|||||||
type: string
|
type: string
|
||||||
default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
|
default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
|
||||||
schedule:
|
schedule:
|
||||||
- cron: '15 14 * * *'
|
- cron: '15 14 * * 0,2'
|
||||||
|
|
||||||
env:
|
env:
|
||||||
HF_DATASETS_OFFLINE: 1
|
HF_DATASETS_OFFLINE: 1
|
||||||
@ -87,7 +87,7 @@ jobs:
|
|||||||
name: my-artifact-${{ github.run_id }}
|
name: my-artifact-${{ github.run_id }}
|
||||||
|
|
||||||
build-pypi-lmdeploy:
|
build-pypi-lmdeploy:
|
||||||
if: ${{!cancelled() && (github.event_name != 'schedule' && inputs.build_lmdeploy)}}
|
if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.build_lmdeploy)}}
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
pyver: [py310]
|
pyver: [py310]
|
||||||
@ -127,7 +127,7 @@ jobs:
|
|||||||
needs: ['build-pypi', 'build-pypi-lmdeploy']
|
needs: ['build-pypi', 'build-pypi-lmdeploy']
|
||||||
runs-on: volc_cu12
|
runs-on: volc_cu12
|
||||||
environment: 'prod'
|
environment: 'prod'
|
||||||
timeout-minutes: 240 #4hours
|
timeout-minutes: 120 #2hours
|
||||||
steps:
|
steps:
|
||||||
- name: Clone repository
|
- name: Clone repository
|
||||||
uses: actions/checkout@v2
|
uses: actions/checkout@v2
|
||||||
@ -148,7 +148,7 @@ jobs:
|
|||||||
uses: nick-fields/retry@v3
|
uses: nick-fields/retry@v3
|
||||||
with:
|
with:
|
||||||
max_attempts: 1
|
max_attempts: 1
|
||||||
timeout_minutes: 240
|
timeout_minutes: 120
|
||||||
command: |
|
command: |
|
||||||
. ${{env.CONDA_PATH}}/bin/activate
|
. ${{env.CONDA_PATH}}/bin/activate
|
||||||
conda create -y --name ${{env.CONDA_ENV}} python=3.10
|
conda create -y --name ${{env.CONDA_ENV}} python=3.10
|
||||||
@ -211,7 +211,7 @@ jobs:
|
|||||||
uses: nick-fields/retry@v3
|
uses: nick-fields/retry@v3
|
||||||
with:
|
with:
|
||||||
max_attempts: 1
|
max_attempts: 1
|
||||||
timeout_minutes: 120
|
timeout_minutes: 180
|
||||||
command: |
|
command: |
|
||||||
. ${{env.CONDA_PATH}}/bin/activate
|
. ${{env.CONDA_PATH}}/bin/activate
|
||||||
conda activate ${{env.CONDA_ENV}}
|
conda activate ${{env.CONDA_ENV}}
|
||||||
@ -230,7 +230,7 @@ jobs:
|
|||||||
regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
|
regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
|
||||||
runs-on: volc_cu12_local
|
runs-on: volc_cu12_local
|
||||||
environment: 'prod'
|
environment: 'prod'
|
||||||
timeout-minutes: 240 #4hours
|
timeout-minutes: 480 #8hours
|
||||||
steps:
|
steps:
|
||||||
- name: Clone repository
|
- name: Clone repository
|
||||||
uses: actions/checkout@v2
|
uses: actions/checkout@v2
|
||||||
@ -306,7 +306,7 @@ jobs:
|
|||||||
function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
|
function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
|
||||||
runs-on: volc_cu12
|
runs-on: volc_cu12
|
||||||
environment: 'prod'
|
environment: 'prod'
|
||||||
timeout-minutes: 360 #6hours
|
timeout-minutes: 480 #8hours
|
||||||
steps:
|
steps:
|
||||||
- name: Clone repository
|
- name: Clone repository
|
||||||
uses: actions/checkout@v2
|
uses: actions/checkout@v2
|
||||||
@ -323,7 +323,7 @@ jobs:
|
|||||||
uses: nick-fields/retry@v3
|
uses: nick-fields/retry@v3
|
||||||
with:
|
with:
|
||||||
max_attempts: 1
|
max_attempts: 1
|
||||||
timeout_minutes: 360
|
timeout_minutes: 480
|
||||||
command: |
|
command: |
|
||||||
. ${{env.CONDA_PATH}}/bin/activate
|
. ${{env.CONDA_PATH}}/bin/activate
|
||||||
conda activate ${{env.CONDA_ENV}}
|
conda activate ${{env.CONDA_ENV}}
|
||||||
|
Loading…
Reference in New Issue
Block a user