[CI] Update daily test metrics threshold (#1812)

* Update daily-run-test.yml

* Update pr-run-test.yml

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

---------

Co-authored-by: MaiziXiao <xxllcc1993@gmail.com>
This commit is contained in:
zhulinJulia24 2025-01-09 18:16:24 +08:00 committed by GitHub
parent e039f3efa0
commit abdcee68f6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 227 additions and 145 deletions

View File

@ -37,3 +37,6 @@ models = [
retry=20,
)
]
for d in datasets:
d['reader_cfg']['test_range'] = '[0:16]'

View File

@ -79,12 +79,8 @@ with read_base():
models as lmdeploy_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import \
models as lmdeploy_llama3_70b_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
models as hf_mistral_7b_v0_3_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_7b import \
models as hf_qwen_2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \

View File

@ -101,12 +101,8 @@ with read_base():
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm3_4b import \
models as hf_minicpm3_4b_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
models as hf_qwen2_5_0_5b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \

View File

@ -72,7 +72,7 @@ class TestChat:
base_score = baseline_scores_testrange.get('chat').get(model).get(
dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
assert_score(model, result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@ -94,7 +94,7 @@ class TestBase:
base_score = baseline_scores_testrange.get('base').get(model).get(
dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
assert_score(model, result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@ -112,7 +112,7 @@ class TestChatObjFullbench:
base_score = baseline_scores_fullbench.get(model).get('objective').get(
dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
assert_score(model, result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@ -131,7 +131,7 @@ class TestChatSubFullbench:
base_score = baseline_scores_fullbench.get(model).get(
'subjective').get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
assert_score(model, result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@ -150,7 +150,7 @@ class TestBaseFullbench:
base_score = baseline_scores_fullbench.get(model).get('objective').get(
dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
assert_score(model, result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@ -166,7 +166,7 @@ class TestApibench:
def test_api(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
assert_score(model + '_batch', result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@ -185,7 +185,7 @@ class TestVolcFullbench:
base_score = baseline_scores_fullbench.get(model).get('objective').get(
dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
assert_score(model + '_batch', result_score, base_score, dataset)
@pytest.mark.parametrize('model, dataset', [
(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
@ -197,7 +197,7 @@ class TestVolcFullbench:
base_score = baseline_scores_fullbench.get(model).get(
'subjective').get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
assert_score(model + '_batch', result_score, base_score, dataset)
@pytest.mark.parametrize(
'model, dataset',
@ -209,7 +209,7 @@ class TestVolcFullbench:
base_score = baseline_scores_fullbench.get(model).get('objective').get(
dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
assert_score(model + '_batch', result_score, base_score, dataset)
@pytest.mark.parametrize(
'model, dataset',
@ -221,7 +221,7 @@ class TestVolcFullbench:
base_score = baseline_scores_fullbench.get(model).get(
'long_context').get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
assert_score(model + '_batch', result_score, base_score, dataset)
@pytest.mark.parametrize(
'model, dataset',
@ -234,7 +234,7 @@ class TestVolcFullbench:
base_score = baseline_scores_fullbench.get(model).get(
'long_context').get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
assert_score(model + '_batch', result_score, base_score, dataset)
@pytest.mark.usefixtures('result_scores')
@ -252,7 +252,7 @@ class TestCmdCase:
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
assert_score(model, result_score, base_score, dataset)
@pytest.mark.case2
@pytest.mark.parametrize(
@ -266,7 +266,7 @@ class TestCmdCase:
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
assert_score(model + '_batch', result_score, base_score, dataset)
@pytest.mark.case3
@pytest.mark.parametrize('model, dataset',
@ -276,7 +276,7 @@ class TestCmdCase:
def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
assert_score(model, result_score, base_score, dataset)
@pytest.mark.case4
@pytest.mark.parametrize(
@ -286,13 +286,10 @@ class TestCmdCase:
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
assert_score(model, result_score, base_score, dataset)
THRESHOLD = 3
def assert_score(model_type, score, baseline):
def assert_score(model_type, score, baseline, dataset: str = ''):
if score is None or score == '-':
assert False, 'value is none'
@ -305,24 +302,33 @@ def assert_score(model_type, score, baseline):
print(' '.join([score, 'is not equal', str(baseline)]))
assert False, ' '.join([score, 'is not equal', str(baseline)])
else:
if float(score) <= (baseline + THRESHOLD) and float(score) >= (
baseline - THRESHOLD):
if dataset.startswith('dingo') or dataset.startswith(
'GPQA') or dataset.startswith('high') or dataset.startswith(
'mmlu_pro_') or dataset.startswith(
'alpaca_eval') or dataset.startswith('compassarena_'):
threshold = 5
elif dataset.startswith('humanevalx') or dataset == 'large_threshold':
threshold = 10
else:
threshold = 3
if float(score) <= (baseline + threshold) and float(score) >= (
baseline - threshold):
print(' '.join([
score, 'is between',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
str(baseline - threshold), 'and',
str(baseline + threshold)
]))
assert True
else:
print(' '.join([
score, 'is not etween',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
score, 'is not between',
str(baseline - threshold), 'and',
str(baseline + threshold)
]))
assert False, ' '.join([
score, 'is not etween',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
score, 'is not between',
str(baseline - threshold), 'and',
str(baseline + threshold)
])

View File

@ -19,7 +19,7 @@ internlm2_5-7b-chat-lmdeploy:
race-high_accuracy: 90.54
internlm2-chat-1.8b-lmdeploy:
demo_gsm8k_accuracy: 32
demo_gsm8k_accuracy: 31
race-middle_accuracy: 81.34
race-high_accuracy: 73.96
@ -29,6 +29,6 @@ internlm2_5-7b-chat_hf:
race-high_accuracy: 90.48
lmdeploy-api-test:
gsm8k_accuracy: 83.78
race-middle_accuracy: 92.41
race-high_accuracy: 90.37
gsm8k_accuracy: 68.75
race-middle_accuracy: 87.50
race-high_accuracy: 93.75

View File

@ -299,93 +299,93 @@ internlm2_5-7b-chat-turbomind:
IFEval_Prompt-level-strict-accuracy: 58.04
drop_accuracy: 77.68
bbh_naive_average: 73.14
GPQA_diamond_accuracy: 25.76
GPQA_diamond_accuracy: 31.06
hellaswag_accuracy: 94.79
TheoremQA_score: 21.5
musr_average_naive_average: 51.03
korbench_single_naive_average: 31.92
ARC_Prize_Public_Evaluation_accuracy: 0.01
TheoremQA_score: 22.25
musr_average_naive_average: 50.89
korbench_single_naive_average: 32.16
ARC_Prize_Public_Evaluation_accuracy: 0.02
gsm8k_accuracy: 86.73
GaokaoBench_weighted_average: 77.89
math_accuracy: 61.5
cmo_fib_accuracy: 12.5
aime2024_accuracy: 3.33
Mathbench_naive_average: 65.17
wikibench-wiki-single_choice_cncircular_perf_4: 31.55
cmmlu_naive_average: 74.14
mmlu_naive_average: 70.52
GaokaoBench_weighted_average: 78.6
math_accuracy: 61
cmo_fib_accuracy: 11
aime2024_accuracy: 6.67
Mathbench_naive_average: 64.23
wikibench-wiki-single_choice_cncircular_perf_4: 31.32
cmmlu_naive_average: 74.3
mmlu_naive_average: 70.84
mmlu_pro_naive_average: 44.98
openai_humaneval_humaneval_pass@1: 70.73
sanitized_mbpp_score: 63.81
humanevalx_naive_average: 38.17
openai_humaneval_humaneval_pass@1: 69.8
sanitized_mbpp_score: 64.4
humanevalx_naive_average: 33.35
ds1000_naive_average: 14.15
lcb_code_generation_pass@1: 17.75
lcb_code_execution_pass@1: 32.57
lcb_test_output_pass@1: 24.89
bigcodebench_hard_instruct_pass@1: 0.08
bigcodebench_hard_complete_pass@1: 0.06
teval_naive_average: 80.03
lcb_test_output_pass@1: 26.13
bigcodebench_hard_instruct_pass@1: 8.45
bigcodebench_hard_complete_pass@1: 5.06
teval_naive_average: 80
SciCode_sub_accuracy: 5.56
qa_dingo_cn_score: 99.01
mmlu-stem_naive_average: 68.2
mmlu-social-science_naive_average: 76.11
mmlu-humanities_naive_average: 68.71
mmlu-other_naive_average: 70.56
cmmlu-stem_naive_average: 66.27
cmmlu-social-science_naive_average: 75.7
cmmlu-humanities_naive_average: 77.7
cmmlu-other_naive_average: 77.71
cmmlu-china-specific_naive_average: 72.94
mmlu_pro_biology_accuracy: 66.25
mmlu_pro_business_accuracy: 48.42
mmlu_pro_chemistry_accuracy: 35.25
mmlu_pro_computer_science_accuracy: 47.56
mmlu_pro_economics_accuracy: 55.92
mmlu_pro_engineering_accuracy: 30.44
mmlu_pro_health_accuracy: 45.97
mmlu_pro_history_accuracy: 41.21
mmlu-social-science_naive_average: 75.8
mmlu-humanities_naive_average: 69.3
mmlu-other_naive_average: 71.3
cmmlu-stem_naive_average: 66.64
cmmlu-social-science_naive_average: 76
cmmlu-humanities_naive_average: 77.9
cmmlu-other_naive_average: 77.25
cmmlu-china-specific_naive_average: 73.6
mmlu_pro_biology_accuracy: 66.67
mmlu_pro_business_accuracy: 47.91
mmlu_pro_chemistry_accuracy: 35
mmlu_pro_computer_science_accuracy: 48.9
mmlu_pro_economics_accuracy: 55.87
mmlu_pro_engineering_accuracy: 29.62
mmlu_pro_health_accuracy: 45
mmlu_pro_history_accuracy: 40.8
mmlu_pro_law_accuracy: 25.79
mmlu_pro_math_accuracy: 54.03
mmlu_pro_philosophy_accuracy: 36.47
mmlu_pro_physics_accuracy: 37.41
mmlu_pro_psychology_accuracy: 58.77
mmlu_pro_other_accuracy: 46.21
mmlu_pro_math_accuracy: 53.48
mmlu_pro_philosophy_accuracy: 38.38
mmlu_pro_physics_accuracy: 37.79
mmlu_pro_psychology_accuracy: 58.39
mmlu_pro_other_accuracy: 46.27
humanevalx-python_pass@1: 53.66
humanevalx-cpp_pass@1: 24.39
humanevalx-cpp_pass@1: 22.56
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 57.93
humanevalx-js_pass@1: 54.88
ds1000_Pandas_accuracy: 12.03
ds1000_Numpy_accuracy: 4.09
ds1000_Tensorflow_accuracy: 11.11
ds1000_Scipy_accuracy: 8.49
ds1000_Pandas_accuracy: 10.65
ds1000_Numpy_accuracy: 3.63
ds1000_Tensorflow_accuracy: 13.33
ds1000_Scipy_accuracy: 8.96
ds1000_Sklearn_accuracy: 6.96
ds1000_Pytorch_accuracy: 7.35
ds1000_Matplotlib_accuracy: 49.03
openai_mmmlu_lite_AR-XY_accuracy: 17.89
openai_mmmlu_lite_BN-BD_accuracy: 27.58
openai_mmmlu_lite_DE-DE_accuracy: 51.16
openai_mmmlu_lite_ES-LA_accuracy: 56.84
openai_mmmlu_lite_FR-FR_accuracy: 57.96
openai_mmmlu_lite_HI-IN_accuracy: 33.68
openai_mmmlu_lite_ID-ID_accuracy: 51.02
openai_mmmlu_lite_IT-IT_accuracy: 50.46
openai_mmmlu_lite_JA-JP_accuracy: 50.53
openai_mmmlu_lite_KO-KR_accuracy: 45.05
ds1000_Pytorch_accuracy: 6.62
ds1000_Matplotlib_accuracy: 49.35
openai_mmmlu_lite_AR-XY_accuracy: 17.19
openai_mmmlu_lite_BN-BD_accuracy: 26.78
openai_mmmlu_lite_DE-DE_accuracy: 51.27
openai_mmmlu_lite_ES-LA_accuracy: 56.94
openai_mmmlu_lite_FR-FR_accuracy: 58.22
openai_mmmlu_lite_HI-IN_accuracy: 33.75
openai_mmmlu_lite_ID-ID_accuracy: 50.6
openai_mmmlu_lite_IT-IT_accuracy: 50.6
openai_mmmlu_lite_JA-JP_accuracy: 51.13
openai_mmmlu_lite_KO-KR_accuracy: 45
openai_mmmlu_lite_PT-BR_accuracy: 57.68
openai_mmmlu_lite_SW-KE_accuracy: 32.77
openai_mmmlu_lite_YO-NG_accuracy: 31.79
openai_mmmlu_lite_ZH-CN_accuracy: 65.05
college_naive_average: 20.33
high_naive_average: 47.67
middle_naive_average: 62
primary_naive_average: 72
arithmetic_naive_average: 62.33
mathbench-a (average)_naive_average: 52.87
college_knowledge_naive_average: 70.57
high_knowledge_naive_average: 70.13
middle_knowledge_naive_average: 81.17
primary_knowledge_naive_average: 88.01
mathbench-t (average)_naive_average: 77.47
openai_mmmlu_lite_SW-KE_accuracy: 32.56
openai_mmmlu_lite_YO-NG_accuracy: 32.42
openai_mmmlu_lite_ZH-CN_accuracy: 65.4
college_naive_average: 19.17
high_naive_average: 46.5
middle_naive_average: 61.34
primary_naive_average: 73.34
arithmetic_naive_average: 61.67
mathbench-a (average)_naive_average: 52.58
college_knowledge_naive_average: 67.1
high_knowledge_naive_average: 70
middle_knowledge_naive_average: 80
primary_knowledge_naive_average: 87
mathbench-t (average)_naive_average: 76
subjective:
alignment_bench_v1_1_总分: 5.68
alpaca_eval_total: 25.96
@ -414,7 +414,7 @@ internlm2_5-7b-chat-turbomind:
compassarena_knowledge_naive_average: 36
compassarena_reason_v2_naive_average: 35
compassarena_math_v2_naive_average: 19.91
compassarena_creationv2_zh_naive_average: 29.64
compassarena_creationv2_zh_naive_average: 35.81
fofo_test_prompts_overall: 0.35
fofo_test_prompts_cn_overall: 0.41
followbench_llmeval_en_HSR_AVG: 0.73

View File

@ -63,7 +63,7 @@ chat:
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
llama-3_2-3b-instruct-hf:
gsm8k_accuracy: 68.75
gsm8k_accuracy: 65.62
race-high_accuracy: 81.25
llama-3-8b-instruct-hf:
gsm8k_accuracy: 68.75
@ -75,7 +75,7 @@ chat:
gsm8k_accuracy: 78.12
race-high_accuracy: 90.62
llama-3_2-3b-instruct-turbomind:
gsm8k_accuracy: 71.88
gsm8k_accuracy: 62.50
race-high_accuracy: 81.25
llama-3-8b-instruct-turbomind:
gsm8k_accuracy: 71.88
@ -98,15 +98,9 @@ chat:
mistral-7b-instruct-v0.2-vllm:
gsm8k_accuracy: 43.75
race-high_accuracy: 75
MiniCPM3-4B-hf:
gsm8k_accuracy: 68.75
race-high_accuracy: 84.38
phi-3-mini-4k-instruct-hf:
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
phi-3-small-8k-instruct-hf:
gsm8k_accuracy: 0
race-high_accuracy: 0
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
qwen2.5-0.5b-instruct-hf:
gsm8k_accuracy: 34.38
race-high_accuracy: 46.88
@ -321,21 +315,11 @@ base:
GPQA_diamond_accuracy: 12.50
race-high_accuracy: 65.62
winogrande_accuracy: 78.12
mistral-7b-v0.2-hf:
gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 6.25
race-high_accuracy: 62.5
winogrande_accuracy: 59.38
mistral-7b-v0.3-hf:
gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 6.25
race-high_accuracy: 62.5
winogrande_accuracy: 59.38
mistral-7b-v0.2-vllm:
gsm8k_accuracy: 34.38
GPQA_diamond_accuracy: 6.25
race-high_accuracy: 62.5
winogrande_accuracy: 65.62
qwen2.5-7b-hf:
gsm8k_accuracy: 81.25
GPQA_diamond_accuracy: 18.75
@ -457,10 +441,10 @@ base:
race-high_accuracy: 93.75
winogrande_accuracy: 87.5
deepseek-v2-turbomind:
gsm8k_accuracy: 62.5
gsm8k_accuracy: 71.88
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 81.25
winogrande_accuracy: 68.75
winogrande_accuracy: 75
llama-3-70b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12

View File

@ -92,6 +92,7 @@ jobs:
matrix:
pyver: [py310]
runs-on: ubuntu-latest
environment: 'prod'
env:
PYTHON_VERSION: ${{ matrix.pyver }}
PLAT_NAME: manylinux2014_x86_64
@ -187,7 +188,7 @@ jobs:
regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
runs-on: volc_cu12_daily
environment: 'prod'
timeout-minutes: 240 #4hours
timeout-minutes: 120 #2hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@ -321,7 +322,7 @@ jobs:
uses: nick-fields/retry@v3
with:
max_attempts: 1
timeout_minutes: 240
timeout_minutes: 360
command: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
@ -335,7 +336,6 @@ jobs:
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
environment: 'prod'
timeout-minutes: 5
runs-on: self-hosted
steps:

100
.github/workflows/pr-run-test-prod.yml vendored Normal file
View File

@ -0,0 +1,100 @@
name: pr_run_test-prod
on:
pull_request:
paths:
- '.github/**'
workflow_dispatch:
schedule:
- cron: '56 22 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
CONDA_ENV: pr_test
HF_DATASETS_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1
VLLM_USE_MODELSCOPE: false
LMDEPLOY_USE_MODELSCOPE: false
HF_HUB_OFFLINE: 1
CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3
PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip
REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/prtest
COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
jobs:
pr_run_test:
runs-on: volc_cu12_local
environment: 'prod'
timeout-minutes: 30
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Prepare - Install opencompass
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y
python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
- name: conda env
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
pip list
lmdeploy check_env
- name: Run test
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
rm -rf regression_result
opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1 --debug
opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2 --debug --max-num-workers 2
opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3 --debug --max-num-workers 2
- name: Get result
run: |
score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}')
if (( ${score%.*} >= 88 && ${score%.*} <= 89 )); then
echo "score is $score between 88 and 89"
else
echo "score is $score not between 88 and 89"
exit 1
fi
score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}')
if (( ${score%.*} >= 87 && ${score%.*} <= 88 )); then
echo "score is $score between 87 and 88"
else
echo "score is $score not between 87 and 88"
exit 1
fi
score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}')
if (( ${score%.*} >= 87 && ${score%.*} <= 91 )); then
echo "score is $score between 87 and 91"
else
echo "score is $score not between 87 and 91"
exit 1
fi
- name: Uninstall opencompass
if: always()
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y
conda info --envs
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
needs: [pr_run_test]
timeout-minutes: 5
runs-on: self-hosted
steps:
- name: notify
run: |
curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}

View File

@ -8,10 +8,9 @@ on:
- 'docs/**'
- 'configs/**'
- 'tools/**'
paths:
- '!.github/**'
workflow_dispatch:
schedule:
- cron: '56 22 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@ -35,7 +34,6 @@ env:
jobs:
pr_run_test:
runs-on: volc_cu12_local
environment: 'prod'
timeout-minutes: 30
steps:
- name: Checkout repository
@ -97,7 +95,6 @@ jobs:
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
needs: [pr_run_test]
environment: 'prod'
timeout-minutes: 5
runs-on: self-hosted
steps: