mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[CI] Update daily test metrics threshold (#1812)
* Update daily-run-test.yml * Update pr-run-test.yml * update * update * update * update * update * update * update * update * update * update * update --------- Co-authored-by: MaiziXiao <xxllcc1993@gmail.com>
This commit is contained in:
parent
e039f3efa0
commit
abdcee68f6
3
.github/scripts/eval_regression_api.py
vendored
3
.github/scripts/eval_regression_api.py
vendored
@ -37,3 +37,6 @@ models = [
|
||||
retry=20,
|
||||
)
|
||||
]
|
||||
|
||||
for d in datasets:
|
||||
d['reader_cfg']['test_range'] = '[0:16]'
|
||||
|
@ -79,12 +79,8 @@ with read_base():
|
||||
models as lmdeploy_llama3_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import \
|
||||
models as lmdeploy_llama3_70b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
|
||||
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
|
||||
models as hf_mistral_7b_v0_3_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
|
||||
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_7b import \
|
||||
models as hf_qwen_2_5_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \
|
||||
|
@ -101,12 +101,8 @@ with read_base():
|
||||
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
|
||||
models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501
|
||||
from opencompass.configs.models.openbmb.hf_minicpm3_4b import \
|
||||
models as hf_minicpm3_4b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
|
||||
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
|
||||
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
|
||||
models as hf_qwen2_5_0_5b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \
|
||||
|
64
.github/scripts/oc_score_assert.py
vendored
64
.github/scripts/oc_score_assert.py
vendored
@ -72,7 +72,7 @@ class TestChat:
|
||||
base_score = baseline_scores_testrange.get('chat').get(model).get(
|
||||
dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model, result_score, base_score)
|
||||
assert_score(model, result_score, base_score, dataset)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
|
||||
@ -94,7 +94,7 @@ class TestBase:
|
||||
base_score = baseline_scores_testrange.get('base').get(model).get(
|
||||
dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model, result_score, base_score)
|
||||
assert_score(model, result_score, base_score, dataset)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
|
||||
@ -112,7 +112,7 @@ class TestChatObjFullbench:
|
||||
base_score = baseline_scores_fullbench.get(model).get('objective').get(
|
||||
dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model, result_score, base_score)
|
||||
assert_score(model, result_score, base_score, dataset)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
|
||||
@ -131,7 +131,7 @@ class TestChatSubFullbench:
|
||||
base_score = baseline_scores_fullbench.get(model).get(
|
||||
'subjective').get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model, result_score, base_score)
|
||||
assert_score(model, result_score, base_score, dataset)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
|
||||
@ -150,7 +150,7 @@ class TestBaseFullbench:
|
||||
base_score = baseline_scores_fullbench.get(model).get('objective').get(
|
||||
dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model, result_score, base_score)
|
||||
assert_score(model, result_score, base_score, dataset)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
|
||||
@ -166,7 +166,7 @@ class TestApibench:
|
||||
def test_api(self, baseline_scores, result_scores, model, dataset):
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model + '_batch', result_score, base_score)
|
||||
assert_score(model + '_batch', result_score, base_score, dataset)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
|
||||
@ -185,7 +185,7 @@ class TestVolcFullbench:
|
||||
base_score = baseline_scores_fullbench.get(model).get('objective').get(
|
||||
dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model + '_batch', result_score, base_score)
|
||||
assert_score(model + '_batch', result_score, base_score, dataset)
|
||||
|
||||
@pytest.mark.parametrize('model, dataset', [
|
||||
(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
|
||||
@ -197,7 +197,7 @@ class TestVolcFullbench:
|
||||
base_score = baseline_scores_fullbench.get(model).get(
|
||||
'subjective').get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model + '_batch', result_score, base_score)
|
||||
assert_score(model + '_batch', result_score, base_score, dataset)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'model, dataset',
|
||||
@ -209,7 +209,7 @@ class TestVolcFullbench:
|
||||
base_score = baseline_scores_fullbench.get(model).get('objective').get(
|
||||
dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model + '_batch', result_score, base_score)
|
||||
assert_score(model + '_batch', result_score, base_score, dataset)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'model, dataset',
|
||||
@ -221,7 +221,7 @@ class TestVolcFullbench:
|
||||
base_score = baseline_scores_fullbench.get(model).get(
|
||||
'long_context').get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model + '_batch', result_score, base_score)
|
||||
assert_score(model + '_batch', result_score, base_score, dataset)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'model, dataset',
|
||||
@ -234,7 +234,7 @@ class TestVolcFullbench:
|
||||
base_score = baseline_scores_fullbench.get(model).get(
|
||||
'long_context').get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model + '_batch', result_score, base_score)
|
||||
assert_score(model + '_batch', result_score, base_score, dataset)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
|
||||
@ -252,7 +252,7 @@ class TestCmdCase:
|
||||
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model, result_score, base_score)
|
||||
assert_score(model, result_score, base_score, dataset)
|
||||
|
||||
@pytest.mark.case2
|
||||
@pytest.mark.parametrize(
|
||||
@ -266,7 +266,7 @@ class TestCmdCase:
|
||||
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model + '_batch', result_score, base_score)
|
||||
assert_score(model + '_batch', result_score, base_score, dataset)
|
||||
|
||||
@pytest.mark.case3
|
||||
@pytest.mark.parametrize('model, dataset',
|
||||
@ -276,7 +276,7 @@ class TestCmdCase:
|
||||
def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model, result_score, base_score)
|
||||
assert_score(model, result_score, base_score, dataset)
|
||||
|
||||
@pytest.mark.case4
|
||||
@pytest.mark.parametrize(
|
||||
@ -286,13 +286,10 @@ class TestCmdCase:
|
||||
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model, result_score, base_score)
|
||||
assert_score(model, result_score, base_score, dataset)
|
||||
|
||||
|
||||
THRESHOLD = 3
|
||||
|
||||
|
||||
def assert_score(model_type, score, baseline):
|
||||
def assert_score(model_type, score, baseline, dataset: str = ''):
|
||||
if score is None or score == '-':
|
||||
assert False, 'value is none'
|
||||
|
||||
@ -305,24 +302,33 @@ def assert_score(model_type, score, baseline):
|
||||
print(' '.join([score, 'is not equal', str(baseline)]))
|
||||
assert False, ' '.join([score, 'is not equal', str(baseline)])
|
||||
else:
|
||||
if float(score) <= (baseline + THRESHOLD) and float(score) >= (
|
||||
baseline - THRESHOLD):
|
||||
if dataset.startswith('dingo') or dataset.startswith(
|
||||
'GPQA') or dataset.startswith('high') or dataset.startswith(
|
||||
'mmlu_pro_') or dataset.startswith(
|
||||
'alpaca_eval') or dataset.startswith('compassarena_'):
|
||||
threshold = 5
|
||||
elif dataset.startswith('humanevalx') or dataset == 'large_threshold':
|
||||
threshold = 10
|
||||
else:
|
||||
threshold = 3
|
||||
if float(score) <= (baseline + threshold) and float(score) >= (
|
||||
baseline - threshold):
|
||||
print(' '.join([
|
||||
score, 'is between',
|
||||
str(baseline - THRESHOLD), 'and',
|
||||
str(baseline + THRESHOLD)
|
||||
str(baseline - threshold), 'and',
|
||||
str(baseline + threshold)
|
||||
]))
|
||||
assert True
|
||||
else:
|
||||
print(' '.join([
|
||||
score, 'is not etween',
|
||||
str(baseline - THRESHOLD), 'and',
|
||||
str(baseline + THRESHOLD)
|
||||
score, 'is not between',
|
||||
str(baseline - threshold), 'and',
|
||||
str(baseline + threshold)
|
||||
]))
|
||||
assert False, ' '.join([
|
||||
score, 'is not etween',
|
||||
str(baseline - THRESHOLD), 'and',
|
||||
str(baseline + THRESHOLD)
|
||||
score, 'is not between',
|
||||
str(baseline - threshold), 'and',
|
||||
str(baseline + threshold)
|
||||
])
|
||||
|
||||
|
||||
|
8
.github/scripts/oc_score_baseline.yaml
vendored
8
.github/scripts/oc_score_baseline.yaml
vendored
@ -19,7 +19,7 @@ internlm2_5-7b-chat-lmdeploy:
|
||||
race-high_accuracy: 90.54
|
||||
|
||||
internlm2-chat-1.8b-lmdeploy:
|
||||
demo_gsm8k_accuracy: 32
|
||||
demo_gsm8k_accuracy: 31
|
||||
race-middle_accuracy: 81.34
|
||||
race-high_accuracy: 73.96
|
||||
|
||||
@ -29,6 +29,6 @@ internlm2_5-7b-chat_hf:
|
||||
race-high_accuracy: 90.48
|
||||
|
||||
lmdeploy-api-test:
|
||||
gsm8k_accuracy: 83.78
|
||||
race-middle_accuracy: 92.41
|
||||
race-high_accuracy: 90.37
|
||||
gsm8k_accuracy: 68.75
|
||||
race-middle_accuracy: 87.50
|
||||
race-high_accuracy: 93.75
|
||||
|
148
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
148
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
@ -299,93 +299,93 @@ internlm2_5-7b-chat-turbomind:
|
||||
IFEval_Prompt-level-strict-accuracy: 58.04
|
||||
drop_accuracy: 77.68
|
||||
bbh_naive_average: 73.14
|
||||
GPQA_diamond_accuracy: 25.76
|
||||
GPQA_diamond_accuracy: 31.06
|
||||
hellaswag_accuracy: 94.79
|
||||
TheoremQA_score: 21.5
|
||||
musr_average_naive_average: 51.03
|
||||
korbench_single_naive_average: 31.92
|
||||
ARC_Prize_Public_Evaluation_accuracy: 0.01
|
||||
TheoremQA_score: 22.25
|
||||
musr_average_naive_average: 50.89
|
||||
korbench_single_naive_average: 32.16
|
||||
ARC_Prize_Public_Evaluation_accuracy: 0.02
|
||||
gsm8k_accuracy: 86.73
|
||||
GaokaoBench_weighted_average: 77.89
|
||||
math_accuracy: 61.5
|
||||
cmo_fib_accuracy: 12.5
|
||||
aime2024_accuracy: 3.33
|
||||
Mathbench_naive_average: 65.17
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 31.55
|
||||
cmmlu_naive_average: 74.14
|
||||
mmlu_naive_average: 70.52
|
||||
GaokaoBench_weighted_average: 78.6
|
||||
math_accuracy: 61
|
||||
cmo_fib_accuracy: 11
|
||||
aime2024_accuracy: 6.67
|
||||
Mathbench_naive_average: 64.23
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 31.32
|
||||
cmmlu_naive_average: 74.3
|
||||
mmlu_naive_average: 70.84
|
||||
mmlu_pro_naive_average: 44.98
|
||||
openai_humaneval_humaneval_pass@1: 70.73
|
||||
sanitized_mbpp_score: 63.81
|
||||
humanevalx_naive_average: 38.17
|
||||
openai_humaneval_humaneval_pass@1: 69.8
|
||||
sanitized_mbpp_score: 64.4
|
||||
humanevalx_naive_average: 33.35
|
||||
ds1000_naive_average: 14.15
|
||||
lcb_code_generation_pass@1: 17.75
|
||||
lcb_code_execution_pass@1: 32.57
|
||||
lcb_test_output_pass@1: 24.89
|
||||
bigcodebench_hard_instruct_pass@1: 0.08
|
||||
bigcodebench_hard_complete_pass@1: 0.06
|
||||
teval_naive_average: 80.03
|
||||
lcb_test_output_pass@1: 26.13
|
||||
bigcodebench_hard_instruct_pass@1: 8.45
|
||||
bigcodebench_hard_complete_pass@1: 5.06
|
||||
teval_naive_average: 80
|
||||
SciCode_sub_accuracy: 5.56
|
||||
qa_dingo_cn_score: 99.01
|
||||
mmlu-stem_naive_average: 68.2
|
||||
mmlu-social-science_naive_average: 76.11
|
||||
mmlu-humanities_naive_average: 68.71
|
||||
mmlu-other_naive_average: 70.56
|
||||
cmmlu-stem_naive_average: 66.27
|
||||
cmmlu-social-science_naive_average: 75.7
|
||||
cmmlu-humanities_naive_average: 77.7
|
||||
cmmlu-other_naive_average: 77.71
|
||||
cmmlu-china-specific_naive_average: 72.94
|
||||
mmlu_pro_biology_accuracy: 66.25
|
||||
mmlu_pro_business_accuracy: 48.42
|
||||
mmlu_pro_chemistry_accuracy: 35.25
|
||||
mmlu_pro_computer_science_accuracy: 47.56
|
||||
mmlu_pro_economics_accuracy: 55.92
|
||||
mmlu_pro_engineering_accuracy: 30.44
|
||||
mmlu_pro_health_accuracy: 45.97
|
||||
mmlu_pro_history_accuracy: 41.21
|
||||
mmlu-social-science_naive_average: 75.8
|
||||
mmlu-humanities_naive_average: 69.3
|
||||
mmlu-other_naive_average: 71.3
|
||||
cmmlu-stem_naive_average: 66.64
|
||||
cmmlu-social-science_naive_average: 76
|
||||
cmmlu-humanities_naive_average: 77.9
|
||||
cmmlu-other_naive_average: 77.25
|
||||
cmmlu-china-specific_naive_average: 73.6
|
||||
mmlu_pro_biology_accuracy: 66.67
|
||||
mmlu_pro_business_accuracy: 47.91
|
||||
mmlu_pro_chemistry_accuracy: 35
|
||||
mmlu_pro_computer_science_accuracy: 48.9
|
||||
mmlu_pro_economics_accuracy: 55.87
|
||||
mmlu_pro_engineering_accuracy: 29.62
|
||||
mmlu_pro_health_accuracy: 45
|
||||
mmlu_pro_history_accuracy: 40.8
|
||||
mmlu_pro_law_accuracy: 25.79
|
||||
mmlu_pro_math_accuracy: 54.03
|
||||
mmlu_pro_philosophy_accuracy: 36.47
|
||||
mmlu_pro_physics_accuracy: 37.41
|
||||
mmlu_pro_psychology_accuracy: 58.77
|
||||
mmlu_pro_other_accuracy: 46.21
|
||||
mmlu_pro_math_accuracy: 53.48
|
||||
mmlu_pro_philosophy_accuracy: 38.38
|
||||
mmlu_pro_physics_accuracy: 37.79
|
||||
mmlu_pro_psychology_accuracy: 58.39
|
||||
mmlu_pro_other_accuracy: 46.27
|
||||
humanevalx-python_pass@1: 53.66
|
||||
humanevalx-cpp_pass@1: 24.39
|
||||
humanevalx-cpp_pass@1: 22.56
|
||||
humanevalx-go_pass@1: 0
|
||||
humanevalx-java_pass@1: 57.93
|
||||
humanevalx-js_pass@1: 54.88
|
||||
ds1000_Pandas_accuracy: 12.03
|
||||
ds1000_Numpy_accuracy: 4.09
|
||||
ds1000_Tensorflow_accuracy: 11.11
|
||||
ds1000_Scipy_accuracy: 8.49
|
||||
ds1000_Pandas_accuracy: 10.65
|
||||
ds1000_Numpy_accuracy: 3.63
|
||||
ds1000_Tensorflow_accuracy: 13.33
|
||||
ds1000_Scipy_accuracy: 8.96
|
||||
ds1000_Sklearn_accuracy: 6.96
|
||||
ds1000_Pytorch_accuracy: 7.35
|
||||
ds1000_Matplotlib_accuracy: 49.03
|
||||
openai_mmmlu_lite_AR-XY_accuracy: 17.89
|
||||
openai_mmmlu_lite_BN-BD_accuracy: 27.58
|
||||
openai_mmmlu_lite_DE-DE_accuracy: 51.16
|
||||
openai_mmmlu_lite_ES-LA_accuracy: 56.84
|
||||
openai_mmmlu_lite_FR-FR_accuracy: 57.96
|
||||
openai_mmmlu_lite_HI-IN_accuracy: 33.68
|
||||
openai_mmmlu_lite_ID-ID_accuracy: 51.02
|
||||
openai_mmmlu_lite_IT-IT_accuracy: 50.46
|
||||
openai_mmmlu_lite_JA-JP_accuracy: 50.53
|
||||
openai_mmmlu_lite_KO-KR_accuracy: 45.05
|
||||
ds1000_Pytorch_accuracy: 6.62
|
||||
ds1000_Matplotlib_accuracy: 49.35
|
||||
openai_mmmlu_lite_AR-XY_accuracy: 17.19
|
||||
openai_mmmlu_lite_BN-BD_accuracy: 26.78
|
||||
openai_mmmlu_lite_DE-DE_accuracy: 51.27
|
||||
openai_mmmlu_lite_ES-LA_accuracy: 56.94
|
||||
openai_mmmlu_lite_FR-FR_accuracy: 58.22
|
||||
openai_mmmlu_lite_HI-IN_accuracy: 33.75
|
||||
openai_mmmlu_lite_ID-ID_accuracy: 50.6
|
||||
openai_mmmlu_lite_IT-IT_accuracy: 50.6
|
||||
openai_mmmlu_lite_JA-JP_accuracy: 51.13
|
||||
openai_mmmlu_lite_KO-KR_accuracy: 45
|
||||
openai_mmmlu_lite_PT-BR_accuracy: 57.68
|
||||
openai_mmmlu_lite_SW-KE_accuracy: 32.77
|
||||
openai_mmmlu_lite_YO-NG_accuracy: 31.79
|
||||
openai_mmmlu_lite_ZH-CN_accuracy: 65.05
|
||||
college_naive_average: 20.33
|
||||
high_naive_average: 47.67
|
||||
middle_naive_average: 62
|
||||
primary_naive_average: 72
|
||||
arithmetic_naive_average: 62.33
|
||||
mathbench-a (average)_naive_average: 52.87
|
||||
college_knowledge_naive_average: 70.57
|
||||
high_knowledge_naive_average: 70.13
|
||||
middle_knowledge_naive_average: 81.17
|
||||
primary_knowledge_naive_average: 88.01
|
||||
mathbench-t (average)_naive_average: 77.47
|
||||
openai_mmmlu_lite_SW-KE_accuracy: 32.56
|
||||
openai_mmmlu_lite_YO-NG_accuracy: 32.42
|
||||
openai_mmmlu_lite_ZH-CN_accuracy: 65.4
|
||||
college_naive_average: 19.17
|
||||
high_naive_average: 46.5
|
||||
middle_naive_average: 61.34
|
||||
primary_naive_average: 73.34
|
||||
arithmetic_naive_average: 61.67
|
||||
mathbench-a (average)_naive_average: 52.58
|
||||
college_knowledge_naive_average: 67.1
|
||||
high_knowledge_naive_average: 70
|
||||
middle_knowledge_naive_average: 80
|
||||
primary_knowledge_naive_average: 87
|
||||
mathbench-t (average)_naive_average: 76
|
||||
subjective:
|
||||
alignment_bench_v1_1_总分: 5.68
|
||||
alpaca_eval_total: 25.96
|
||||
@ -414,7 +414,7 @@ internlm2_5-7b-chat-turbomind:
|
||||
compassarena_knowledge_naive_average: 36
|
||||
compassarena_reason_v2_naive_average: 35
|
||||
compassarena_math_v2_naive_average: 19.91
|
||||
compassarena_creationv2_zh_naive_average: 29.64
|
||||
compassarena_creationv2_zh_naive_average: 35.81
|
||||
fofo_test_prompts_overall: 0.35
|
||||
fofo_test_prompts_cn_overall: 0.41
|
||||
followbench_llmeval_en_HSR_AVG: 0.73
|
||||
|
28
.github/scripts/oc_score_baseline_testrange.yaml
vendored
28
.github/scripts/oc_score_baseline_testrange.yaml
vendored
@ -63,7 +63,7 @@ chat:
|
||||
gsm8k_accuracy: 84.38
|
||||
race-high_accuracy: 90.62
|
||||
llama-3_2-3b-instruct-hf:
|
||||
gsm8k_accuracy: 68.75
|
||||
gsm8k_accuracy: 65.62
|
||||
race-high_accuracy: 81.25
|
||||
llama-3-8b-instruct-hf:
|
||||
gsm8k_accuracy: 68.75
|
||||
@ -75,7 +75,7 @@ chat:
|
||||
gsm8k_accuracy: 78.12
|
||||
race-high_accuracy: 90.62
|
||||
llama-3_2-3b-instruct-turbomind:
|
||||
gsm8k_accuracy: 71.88
|
||||
gsm8k_accuracy: 62.50
|
||||
race-high_accuracy: 81.25
|
||||
llama-3-8b-instruct-turbomind:
|
||||
gsm8k_accuracy: 71.88
|
||||
@ -98,15 +98,9 @@ chat:
|
||||
mistral-7b-instruct-v0.2-vllm:
|
||||
gsm8k_accuracy: 43.75
|
||||
race-high_accuracy: 75
|
||||
MiniCPM3-4B-hf:
|
||||
gsm8k_accuracy: 68.75
|
||||
race-high_accuracy: 84.38
|
||||
phi-3-mini-4k-instruct-hf:
|
||||
gsm8k_accuracy: 56.25
|
||||
race-high_accuracy: 84.38
|
||||
phi-3-small-8k-instruct-hf:
|
||||
gsm8k_accuracy: 0
|
||||
race-high_accuracy: 0
|
||||
gsm8k_accuracy: 81.25
|
||||
race-high_accuracy: 87.50
|
||||
qwen2.5-0.5b-instruct-hf:
|
||||
gsm8k_accuracy: 34.38
|
||||
race-high_accuracy: 46.88
|
||||
@ -321,21 +315,11 @@ base:
|
||||
GPQA_diamond_accuracy: 12.50
|
||||
race-high_accuracy: 65.62
|
||||
winogrande_accuracy: 78.12
|
||||
mistral-7b-v0.2-hf:
|
||||
gsm8k_accuracy: 31.25
|
||||
GPQA_diamond_accuracy: 6.25
|
||||
race-high_accuracy: 62.5
|
||||
winogrande_accuracy: 59.38
|
||||
mistral-7b-v0.3-hf:
|
||||
gsm8k_accuracy: 31.25
|
||||
GPQA_diamond_accuracy: 6.25
|
||||
race-high_accuracy: 62.5
|
||||
winogrande_accuracy: 59.38
|
||||
mistral-7b-v0.2-vllm:
|
||||
gsm8k_accuracy: 34.38
|
||||
GPQA_diamond_accuracy: 6.25
|
||||
race-high_accuracy: 62.5
|
||||
winogrande_accuracy: 65.62
|
||||
qwen2.5-7b-hf:
|
||||
gsm8k_accuracy: 81.25
|
||||
GPQA_diamond_accuracy: 18.75
|
||||
@ -457,10 +441,10 @@ base:
|
||||
race-high_accuracy: 93.75
|
||||
winogrande_accuracy: 87.5
|
||||
deepseek-v2-turbomind:
|
||||
gsm8k_accuracy: 62.5
|
||||
gsm8k_accuracy: 71.88
|
||||
GPQA_diamond_accuracy: 3.12
|
||||
race-high_accuracy: 81.25
|
||||
winogrande_accuracy: 68.75
|
||||
winogrande_accuracy: 75
|
||||
llama-3-70b-hf:
|
||||
gsm8k_accuracy: 62.5
|
||||
GPQA_diamond_accuracy: 3.12
|
||||
|
6
.github/workflows/daily-run-test.yml
vendored
6
.github/workflows/daily-run-test.yml
vendored
@ -92,6 +92,7 @@ jobs:
|
||||
matrix:
|
||||
pyver: [py310]
|
||||
runs-on: ubuntu-latest
|
||||
environment: 'prod'
|
||||
env:
|
||||
PYTHON_VERSION: ${{ matrix.pyver }}
|
||||
PLAT_NAME: manylinux2014_x86_64
|
||||
@ -187,7 +188,7 @@ jobs:
|
||||
regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
|
||||
runs-on: volc_cu12_daily
|
||||
environment: 'prod'
|
||||
timeout-minutes: 240 #4hours
|
||||
timeout-minutes: 120 #2hours
|
||||
steps:
|
||||
- name: Clone repository
|
||||
uses: actions/checkout@v2
|
||||
@ -321,7 +322,7 @@ jobs:
|
||||
uses: nick-fields/retry@v3
|
||||
with:
|
||||
max_attempts: 1
|
||||
timeout_minutes: 240
|
||||
timeout_minutes: 360
|
||||
command: |
|
||||
. ${{env.CONDA_PATH}}/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
@ -335,7 +336,6 @@ jobs:
|
||||
notify_to_feishu:
|
||||
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
|
||||
needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
|
||||
environment: 'prod'
|
||||
timeout-minutes: 5
|
||||
runs-on: self-hosted
|
||||
steps:
|
||||
|
100
.github/workflows/pr-run-test-prod.yml
vendored
Normal file
100
.github/workflows/pr-run-test-prod.yml
vendored
Normal file
@ -0,0 +1,100 @@
|
||||
name: pr_run_test-prod
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- '.github/**'
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '56 22 * * *'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CONDA_ENV: pr_test
|
||||
HF_DATASETS_OFFLINE: 1
|
||||
HF_EVALUATE_OFFLINE: 1
|
||||
TRANSFORMERS_OFFLINE: 1
|
||||
VLLM_USE_MODELSCOPE: false
|
||||
LMDEPLOY_USE_MODELSCOPE: false
|
||||
HF_HUB_OFFLINE: 1
|
||||
CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3
|
||||
PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip
|
||||
REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/prtest
|
||||
COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
|
||||
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
|
||||
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
|
||||
|
||||
jobs:
|
||||
pr_run_test:
|
||||
runs-on: volc_cu12_local
|
||||
environment: 'prod'
|
||||
timeout-minutes: 30
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2
|
||||
- name: Prepare - Install opencompass
|
||||
run: |
|
||||
. ${{env.CONDA_PATH}}/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
python3 -m pip uninstall opencompass -y
|
||||
python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
conda info --envs
|
||||
- name: conda env
|
||||
run: |
|
||||
. ${{env.CONDA_PATH}}/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda info --envs
|
||||
pip list
|
||||
lmdeploy check_env
|
||||
- name: Run test
|
||||
run: |
|
||||
. ${{env.CONDA_PATH}}/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda info --envs
|
||||
rm -rf regression_result
|
||||
opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1 --debug
|
||||
opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2 --debug --max-num-workers 2
|
||||
opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3 --debug --max-num-workers 2
|
||||
- name: Get result
|
||||
run: |
|
||||
score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}')
|
||||
if (( ${score%.*} >= 88 && ${score%.*} <= 89 )); then
|
||||
echo "score is $score between 88 and 89"
|
||||
else
|
||||
echo "score is $score not between 88 and 89"
|
||||
exit 1
|
||||
fi
|
||||
score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}')
|
||||
if (( ${score%.*} >= 87 && ${score%.*} <= 88 )); then
|
||||
echo "score is $score between 87 and 88"
|
||||
else
|
||||
echo "score is $score not between 87 and 88"
|
||||
exit 1
|
||||
fi
|
||||
score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}')
|
||||
if (( ${score%.*} >= 87 && ${score%.*} <= 91 )); then
|
||||
echo "score is $score between 87 and 91"
|
||||
else
|
||||
echo "score is $score not between 87 and 91"
|
||||
exit 1
|
||||
fi
|
||||
- name: Uninstall opencompass
|
||||
if: always()
|
||||
run: |
|
||||
. ${{env.CONDA_PATH}}/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
python3 -m pip uninstall opencompass -y
|
||||
conda info --envs
|
||||
|
||||
notify_to_feishu:
|
||||
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
|
||||
needs: [pr_run_test]
|
||||
timeout-minutes: 5
|
||||
runs-on: self-hosted
|
||||
steps:
|
||||
- name: notify
|
||||
run: |
|
||||
curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
|
7
.github/workflows/pr-run-test.yml
vendored
7
.github/workflows/pr-run-test.yml
vendored
@ -8,10 +8,9 @@ on:
|
||||
- 'docs/**'
|
||||
- 'configs/**'
|
||||
- 'tools/**'
|
||||
paths:
|
||||
- '!.github/**'
|
||||
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '56 22 * * *'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
@ -35,7 +34,6 @@ env:
|
||||
jobs:
|
||||
pr_run_test:
|
||||
runs-on: volc_cu12_local
|
||||
environment: 'prod'
|
||||
timeout-minutes: 30
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
@ -97,7 +95,6 @@ jobs:
|
||||
notify_to_feishu:
|
||||
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
|
||||
needs: [pr_run_test]
|
||||
environment: 'prod'
|
||||
timeout-minutes: 5
|
||||
runs-on: self-hosted
|
||||
steps:
|
||||
|
Loading…
Reference in New Issue
Block a user