diff --git a/.github/scripts/eval_regression_api.py b/.github/scripts/eval_regression_api.py index db4f0ab2..29ec5406 100644 --- a/.github/scripts/eval_regression_api.py +++ b/.github/scripts/eval_regression_api.py @@ -37,3 +37,6 @@ models = [ retry=20, ) ] + +for d in datasets: + d['reader_cfg']['test_range'] = '[0:16]' diff --git a/.github/scripts/eval_regression_base_models.py b/.github/scripts/eval_regression_base_models.py index 08413707..a8dc7a60 100644 --- a/.github/scripts/eval_regression_base_models.py +++ b/.github/scripts/eval_regression_base_models.py @@ -79,12 +79,8 @@ with read_base(): models as lmdeploy_llama3_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import \ models as lmdeploy_llama3_70b_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \ - models as hf_mistral_7b_v0_2_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \ models as hf_mistral_7b_v0_3_model # noqa: F401, E501 - from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \ - models as vllm_mistral_7b_v0_2_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.hf_qwen_2_5_7b import \ models as hf_qwen_2_5_7b_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \ diff --git a/.github/scripts/eval_regression_chat_models.py b/.github/scripts/eval_regression_chat_models.py index eeade13f..40ec1bc5 100644 --- a/.github/scripts/eval_regression_chat_models.py +++ b/.github/scripts/eval_regression_chat_models.py @@ -101,12 +101,8 @@ with read_base(): models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501 from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \ models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501 - from opencompass.configs.models.openbmb.hf_minicpm3_4b import \ - models as hf_minicpm3_4b_model # noqa: F401, E501 from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \ models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501 - from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \ - models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \ models as hf_qwen2_5_0_5b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \ diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index 6ad6e295..4ef414dc 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -72,7 +72,7 @@ class TestChat: base_score = baseline_scores_testrange.get('chat').get(model).get( dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model, result_score, base_score) + assert_score(model, result_score, base_score, dataset) @pytest.mark.usefixtures('result_scores') @@ -94,7 +94,7 @@ class TestBase: base_score = baseline_scores_testrange.get('base').get(model).get( dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model, result_score, base_score) + assert_score(model, result_score, base_score, dataset) @pytest.mark.usefixtures('result_scores') @@ -112,7 +112,7 @@ class TestChatObjFullbench: base_score = baseline_scores_fullbench.get(model).get('objective').get( dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model, result_score, base_score) + assert_score(model, result_score, base_score, dataset) @pytest.mark.usefixtures('result_scores') @@ -131,7 +131,7 @@ class TestChatSubFullbench: base_score = baseline_scores_fullbench.get(model).get( 'subjective').get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model, result_score, base_score) + assert_score(model, result_score, base_score, dataset) @pytest.mark.usefixtures('result_scores') @@ -150,7 +150,7 @@ class TestBaseFullbench: base_score = baseline_scores_fullbench.get(model).get('objective').get( dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model, result_score, base_score) + assert_score(model, result_score, base_score, dataset) @pytest.mark.usefixtures('result_scores') @@ -166,7 +166,7 @@ class TestApibench: def test_api(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model + '_batch', result_score, base_score) + assert_score(model + '_batch', result_score, base_score, dataset) @pytest.mark.usefixtures('result_scores') @@ -185,7 +185,7 @@ class TestVolcFullbench: base_score = baseline_scores_fullbench.get(model).get('objective').get( dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model + '_batch', result_score, base_score) + assert_score(model + '_batch', result_score, base_score, dataset) @pytest.mark.parametrize('model, dataset', [ (p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind'] @@ -197,7 +197,7 @@ class TestVolcFullbench: base_score = baseline_scores_fullbench.get(model).get( 'subjective').get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model + '_batch', result_score, base_score) + assert_score(model + '_batch', result_score, base_score, dataset) @pytest.mark.parametrize( 'model, dataset', @@ -209,7 +209,7 @@ class TestVolcFullbench: base_score = baseline_scores_fullbench.get(model).get('objective').get( dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model + '_batch', result_score, base_score) + assert_score(model + '_batch', result_score, base_score, dataset) @pytest.mark.parametrize( 'model, dataset', @@ -221,7 +221,7 @@ class TestVolcFullbench: base_score = baseline_scores_fullbench.get(model).get( 'long_context').get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model + '_batch', result_score, base_score) + assert_score(model + '_batch', result_score, base_score, dataset) @pytest.mark.parametrize( 'model, dataset', @@ -234,7 +234,7 @@ class TestVolcFullbench: base_score = baseline_scores_fullbench.get(model).get( 'long_context').get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model + '_batch', result_score, base_score) + assert_score(model + '_batch', result_score, base_score, dataset) @pytest.mark.usefixtures('result_scores') @@ -252,7 +252,7 @@ class TestCmdCase: def test_cmd_case1(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model, result_score, base_score) + assert_score(model, result_score, base_score, dataset) @pytest.mark.case2 @pytest.mark.parametrize( @@ -266,7 +266,7 @@ class TestCmdCase: def test_cmd_case2(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model + '_batch', result_score, base_score) + assert_score(model + '_batch', result_score, base_score, dataset) @pytest.mark.case3 @pytest.mark.parametrize('model, dataset', @@ -276,7 +276,7 @@ class TestCmdCase: def test_cmd_case3(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model, result_score, base_score) + assert_score(model, result_score, base_score, dataset) @pytest.mark.case4 @pytest.mark.parametrize( @@ -286,13 +286,10 @@ class TestCmdCase: def test_cmd_case4(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model, result_score, base_score) + assert_score(model, result_score, base_score, dataset) -THRESHOLD = 3 - - -def assert_score(model_type, score, baseline): +def assert_score(model_type, score, baseline, dataset: str = ''): if score is None or score == '-': assert False, 'value is none' @@ -305,24 +302,33 @@ def assert_score(model_type, score, baseline): print(' '.join([score, 'is not equal', str(baseline)])) assert False, ' '.join([score, 'is not equal', str(baseline)]) else: - if float(score) <= (baseline + THRESHOLD) and float(score) >= ( - baseline - THRESHOLD): + if dataset.startswith('dingo') or dataset.startswith( + 'GPQA') or dataset.startswith('high') or dataset.startswith( + 'mmlu_pro_') or dataset.startswith( + 'alpaca_eval') or dataset.startswith('compassarena_'): + threshold = 5 + elif dataset.startswith('humanevalx') or dataset == 'large_threshold': + threshold = 10 + else: + threshold = 3 + if float(score) <= (baseline + threshold) and float(score) >= ( + baseline - threshold): print(' '.join([ score, 'is between', - str(baseline - THRESHOLD), 'and', - str(baseline + THRESHOLD) + str(baseline - threshold), 'and', + str(baseline + threshold) ])) assert True else: print(' '.join([ - score, 'is not etween', - str(baseline - THRESHOLD), 'and', - str(baseline + THRESHOLD) + score, 'is not between', + str(baseline - threshold), 'and', + str(baseline + threshold) ])) assert False, ' '.join([ - score, 'is not etween', - str(baseline - THRESHOLD), 'and', - str(baseline + THRESHOLD) + score, 'is not between', + str(baseline - threshold), 'and', + str(baseline + threshold) ]) diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index a8e40891..cd2e3328 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -19,7 +19,7 @@ internlm2_5-7b-chat-lmdeploy: race-high_accuracy: 90.54 internlm2-chat-1.8b-lmdeploy: - demo_gsm8k_accuracy: 32 + demo_gsm8k_accuracy: 31 race-middle_accuracy: 81.34 race-high_accuracy: 73.96 @@ -29,6 +29,6 @@ internlm2_5-7b-chat_hf: race-high_accuracy: 90.48 lmdeploy-api-test: - gsm8k_accuracy: 83.78 - race-middle_accuracy: 92.41 - race-high_accuracy: 90.37 + gsm8k_accuracy: 68.75 + race-middle_accuracy: 87.50 + race-high_accuracy: 93.75 diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 568ed5fd..6ab32832 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -299,93 +299,93 @@ internlm2_5-7b-chat-turbomind: IFEval_Prompt-level-strict-accuracy: 58.04 drop_accuracy: 77.68 bbh_naive_average: 73.14 - GPQA_diamond_accuracy: 25.76 + GPQA_diamond_accuracy: 31.06 hellaswag_accuracy: 94.79 - TheoremQA_score: 21.5 - musr_average_naive_average: 51.03 - korbench_single_naive_average: 31.92 - ARC_Prize_Public_Evaluation_accuracy: 0.01 + TheoremQA_score: 22.25 + musr_average_naive_average: 50.89 + korbench_single_naive_average: 32.16 + ARC_Prize_Public_Evaluation_accuracy: 0.02 gsm8k_accuracy: 86.73 - GaokaoBench_weighted_average: 77.89 - math_accuracy: 61.5 - cmo_fib_accuracy: 12.5 - aime2024_accuracy: 3.33 - Mathbench_naive_average: 65.17 - wikibench-wiki-single_choice_cncircular_perf_4: 31.55 - cmmlu_naive_average: 74.14 - mmlu_naive_average: 70.52 + GaokaoBench_weighted_average: 78.6 + math_accuracy: 61 + cmo_fib_accuracy: 11 + aime2024_accuracy: 6.67 + Mathbench_naive_average: 64.23 + wikibench-wiki-single_choice_cncircular_perf_4: 31.32 + cmmlu_naive_average: 74.3 + mmlu_naive_average: 70.84 mmlu_pro_naive_average: 44.98 - openai_humaneval_humaneval_pass@1: 70.73 - sanitized_mbpp_score: 63.81 - humanevalx_naive_average: 38.17 + openai_humaneval_humaneval_pass@1: 69.8 + sanitized_mbpp_score: 64.4 + humanevalx_naive_average: 33.35 ds1000_naive_average: 14.15 lcb_code_generation_pass@1: 17.75 lcb_code_execution_pass@1: 32.57 - lcb_test_output_pass@1: 24.89 - bigcodebench_hard_instruct_pass@1: 0.08 - bigcodebench_hard_complete_pass@1: 0.06 - teval_naive_average: 80.03 + lcb_test_output_pass@1: 26.13 + bigcodebench_hard_instruct_pass@1: 8.45 + bigcodebench_hard_complete_pass@1: 5.06 + teval_naive_average: 80 + SciCode_sub_accuracy: 5.56 qa_dingo_cn_score: 99.01 mmlu-stem_naive_average: 68.2 - mmlu-social-science_naive_average: 76.11 - mmlu-humanities_naive_average: 68.71 - mmlu-other_naive_average: 70.56 - cmmlu-stem_naive_average: 66.27 - cmmlu-social-science_naive_average: 75.7 - cmmlu-humanities_naive_average: 77.7 - cmmlu-other_naive_average: 77.71 - cmmlu-china-specific_naive_average: 72.94 - mmlu_pro_biology_accuracy: 66.25 - mmlu_pro_business_accuracy: 48.42 - mmlu_pro_chemistry_accuracy: 35.25 - mmlu_pro_computer_science_accuracy: 47.56 - mmlu_pro_economics_accuracy: 55.92 - mmlu_pro_engineering_accuracy: 30.44 - mmlu_pro_health_accuracy: 45.97 - mmlu_pro_history_accuracy: 41.21 + mmlu-social-science_naive_average: 75.8 + mmlu-humanities_naive_average: 69.3 + mmlu-other_naive_average: 71.3 + cmmlu-stem_naive_average: 66.64 + cmmlu-social-science_naive_average: 76 + cmmlu-humanities_naive_average: 77.9 + cmmlu-other_naive_average: 77.25 + cmmlu-china-specific_naive_average: 73.6 + mmlu_pro_biology_accuracy: 66.67 + mmlu_pro_business_accuracy: 47.91 + mmlu_pro_chemistry_accuracy: 35 + mmlu_pro_computer_science_accuracy: 48.9 + mmlu_pro_economics_accuracy: 55.87 + mmlu_pro_engineering_accuracy: 29.62 + mmlu_pro_health_accuracy: 45 + mmlu_pro_history_accuracy: 40.8 mmlu_pro_law_accuracy: 25.79 - mmlu_pro_math_accuracy: 54.03 - mmlu_pro_philosophy_accuracy: 36.47 - mmlu_pro_physics_accuracy: 37.41 - mmlu_pro_psychology_accuracy: 58.77 - mmlu_pro_other_accuracy: 46.21 + mmlu_pro_math_accuracy: 53.48 + mmlu_pro_philosophy_accuracy: 38.38 + mmlu_pro_physics_accuracy: 37.79 + mmlu_pro_psychology_accuracy: 58.39 + mmlu_pro_other_accuracy: 46.27 humanevalx-python_pass@1: 53.66 - humanevalx-cpp_pass@1: 24.39 + humanevalx-cpp_pass@1: 22.56 humanevalx-go_pass@1: 0 - humanevalx-java_pass@1: 57.93 humanevalx-js_pass@1: 54.88 - ds1000_Pandas_accuracy: 12.03 - ds1000_Numpy_accuracy: 4.09 - ds1000_Tensorflow_accuracy: 11.11 - ds1000_Scipy_accuracy: 8.49 + ds1000_Pandas_accuracy: 10.65 + ds1000_Numpy_accuracy: 3.63 + ds1000_Tensorflow_accuracy: 13.33 + ds1000_Scipy_accuracy: 8.96 ds1000_Sklearn_accuracy: 6.96 - ds1000_Pytorch_accuracy: 7.35 - ds1000_Matplotlib_accuracy: 49.03 - openai_mmmlu_lite_AR-XY_accuracy: 17.89 - openai_mmmlu_lite_BN-BD_accuracy: 27.58 - openai_mmmlu_lite_DE-DE_accuracy: 51.16 - openai_mmmlu_lite_ES-LA_accuracy: 56.84 - openai_mmmlu_lite_FR-FR_accuracy: 57.96 - openai_mmmlu_lite_HI-IN_accuracy: 33.68 - openai_mmmlu_lite_ID-ID_accuracy: 51.02 - openai_mmmlu_lite_IT-IT_accuracy: 50.46 - openai_mmmlu_lite_JA-JP_accuracy: 50.53 - openai_mmmlu_lite_KO-KR_accuracy: 45.05 + ds1000_Pytorch_accuracy: 6.62 + ds1000_Matplotlib_accuracy: 49.35 + openai_mmmlu_lite_AR-XY_accuracy: 17.19 + openai_mmmlu_lite_BN-BD_accuracy: 26.78 + openai_mmmlu_lite_DE-DE_accuracy: 51.27 + openai_mmmlu_lite_ES-LA_accuracy: 56.94 + openai_mmmlu_lite_FR-FR_accuracy: 58.22 + openai_mmmlu_lite_HI-IN_accuracy: 33.75 + openai_mmmlu_lite_ID-ID_accuracy: 50.6 + openai_mmmlu_lite_IT-IT_accuracy: 50.6 + openai_mmmlu_lite_JA-JP_accuracy: 51.13 + openai_mmmlu_lite_KO-KR_accuracy: 45 openai_mmmlu_lite_PT-BR_accuracy: 57.68 - openai_mmmlu_lite_SW-KE_accuracy: 32.77 - openai_mmmlu_lite_YO-NG_accuracy: 31.79 - openai_mmmlu_lite_ZH-CN_accuracy: 65.05 - college_naive_average: 20.33 - high_naive_average: 47.67 - middle_naive_average: 62 - primary_naive_average: 72 - arithmetic_naive_average: 62.33 - mathbench-a (average)_naive_average: 52.87 - college_knowledge_naive_average: 70.57 - high_knowledge_naive_average: 70.13 - middle_knowledge_naive_average: 81.17 - primary_knowledge_naive_average: 88.01 - mathbench-t (average)_naive_average: 77.47 + openai_mmmlu_lite_SW-KE_accuracy: 32.56 + openai_mmmlu_lite_YO-NG_accuracy: 32.42 + openai_mmmlu_lite_ZH-CN_accuracy: 65.4 + college_naive_average: 19.17 + high_naive_average: 46.5 + middle_naive_average: 61.34 + primary_naive_average: 73.34 + arithmetic_naive_average: 61.67 + mathbench-a (average)_naive_average: 52.58 + college_knowledge_naive_average: 67.1 + high_knowledge_naive_average: 70 + middle_knowledge_naive_average: 80 + primary_knowledge_naive_average: 87 + mathbench-t (average)_naive_average: 76 subjective: alignment_bench_v1_1_总分: 5.68 alpaca_eval_total: 25.96 @@ -414,7 +414,7 @@ internlm2_5-7b-chat-turbomind: compassarena_knowledge_naive_average: 36 compassarena_reason_v2_naive_average: 35 compassarena_math_v2_naive_average: 19.91 - compassarena_creationv2_zh_naive_average: 29.64 + compassarena_creationv2_zh_naive_average: 35.81 fofo_test_prompts_overall: 0.35 fofo_test_prompts_cn_overall: 0.41 followbench_llmeval_en_HSR_AVG: 0.73 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index 131fd2ea..45e20ddd 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -63,7 +63,7 @@ chat: gsm8k_accuracy: 84.38 race-high_accuracy: 90.62 llama-3_2-3b-instruct-hf: - gsm8k_accuracy: 68.75 + gsm8k_accuracy: 65.62 race-high_accuracy: 81.25 llama-3-8b-instruct-hf: gsm8k_accuracy: 68.75 @@ -75,7 +75,7 @@ chat: gsm8k_accuracy: 78.12 race-high_accuracy: 90.62 llama-3_2-3b-instruct-turbomind: - gsm8k_accuracy: 71.88 + gsm8k_accuracy: 62.50 race-high_accuracy: 81.25 llama-3-8b-instruct-turbomind: gsm8k_accuracy: 71.88 @@ -98,15 +98,9 @@ chat: mistral-7b-instruct-v0.2-vllm: gsm8k_accuracy: 43.75 race-high_accuracy: 75 - MiniCPM3-4B-hf: - gsm8k_accuracy: 68.75 - race-high_accuracy: 84.38 phi-3-mini-4k-instruct-hf: - gsm8k_accuracy: 56.25 - race-high_accuracy: 84.38 - phi-3-small-8k-instruct-hf: - gsm8k_accuracy: 0 - race-high_accuracy: 0 + gsm8k_accuracy: 81.25 + race-high_accuracy: 87.50 qwen2.5-0.5b-instruct-hf: gsm8k_accuracy: 34.38 race-high_accuracy: 46.88 @@ -321,21 +315,11 @@ base: GPQA_diamond_accuracy: 12.50 race-high_accuracy: 65.62 winogrande_accuracy: 78.12 - mistral-7b-v0.2-hf: - gsm8k_accuracy: 31.25 - GPQA_diamond_accuracy: 6.25 - race-high_accuracy: 62.5 - winogrande_accuracy: 59.38 mistral-7b-v0.3-hf: gsm8k_accuracy: 31.25 GPQA_diamond_accuracy: 6.25 race-high_accuracy: 62.5 winogrande_accuracy: 59.38 - mistral-7b-v0.2-vllm: - gsm8k_accuracy: 34.38 - GPQA_diamond_accuracy: 6.25 - race-high_accuracy: 62.5 - winogrande_accuracy: 65.62 qwen2.5-7b-hf: gsm8k_accuracy: 81.25 GPQA_diamond_accuracy: 18.75 @@ -457,10 +441,10 @@ base: race-high_accuracy: 93.75 winogrande_accuracy: 87.5 deepseek-v2-turbomind: - gsm8k_accuracy: 62.5 + gsm8k_accuracy: 71.88 GPQA_diamond_accuracy: 3.12 race-high_accuracy: 81.25 - winogrande_accuracy: 68.75 + winogrande_accuracy: 75 llama-3-70b-hf: gsm8k_accuracy: 62.5 GPQA_diamond_accuracy: 3.12 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 6c308c56..5bd1bdac 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -92,6 +92,7 @@ jobs: matrix: pyver: [py310] runs-on: ubuntu-latest + environment: 'prod' env: PYTHON_VERSION: ${{ matrix.pyver }} PLAT_NAME: manylinux2014_x86_64 @@ -187,7 +188,7 @@ jobs: regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}} runs-on: volc_cu12_daily environment: 'prod' - timeout-minutes: 240 #4hours + timeout-minutes: 120 #2hours steps: - name: Clone repository uses: actions/checkout@v2 @@ -321,7 +322,7 @@ jobs: uses: nick-fields/retry@v3 with: max_attempts: 1 - timeout_minutes: 240 + timeout_minutes: 360 command: | . ${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} @@ -335,7 +336,6 @@ jobs: notify_to_feishu: if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test] - environment: 'prod' timeout-minutes: 5 runs-on: self-hosted steps: diff --git a/.github/workflows/pr-run-test-prod.yml b/.github/workflows/pr-run-test-prod.yml new file mode 100644 index 00000000..8dade784 --- /dev/null +++ b/.github/workflows/pr-run-test-prod.yml @@ -0,0 +1,100 @@ +name: pr_run_test-prod + +on: + pull_request: + paths: + - '.github/**' + workflow_dispatch: + schedule: + - cron: '56 22 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + CONDA_ENV: pr_test + HF_DATASETS_OFFLINE: 1 + HF_EVALUATE_OFFLINE: 1 + TRANSFORMERS_OFFLINE: 1 + VLLM_USE_MODELSCOPE: false + LMDEPLOY_USE_MODELSCOPE: false + HF_HUB_OFFLINE: 1 + CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3 + PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip + REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/prtest + COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache + HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub + HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub + +jobs: + pr_run_test: + runs-on: volc_cu12_local + environment: 'prod' + timeout-minutes: 30 + steps: + - name: Checkout repository + uses: actions/checkout@v2 + - name: Prepare - Install opencompass + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + python3 -m pip uninstall opencompass -y + python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}} + conda info --envs + - name: conda env + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + pip list + lmdeploy check_env + - name: Run test + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + rm -rf regression_result + opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1 --debug + opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2 --debug --max-num-workers 2 + opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3 --debug --max-num-workers 2 + - name: Get result + run: | + score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}') + if (( ${score%.*} >= 88 && ${score%.*} <= 89 )); then + echo "score is $score between 88 and 89" + else + echo "score is $score not between 88 and 89" + exit 1 + fi + score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}') + if (( ${score%.*} >= 87 && ${score%.*} <= 88 )); then + echo "score is $score between 87 and 88" + else + echo "score is $score not between 87 and 88" + exit 1 + fi + score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}') + if (( ${score%.*} >= 87 && ${score%.*} <= 91 )); then + echo "score is $score between 87 and 91" + else + echo "score is $score not between 87 and 91" + exit 1 + fi + - name: Uninstall opencompass + if: always() + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + python3 -m pip uninstall opencompass -y + conda info --envs + + notify_to_feishu: + if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} + needs: [pr_run_test] + timeout-minutes: 5 + runs-on: self-hosted + steps: + - name: notify + run: | + curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }} diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml index dfd9a7e5..3e1b6782 100644 --- a/.github/workflows/pr-run-test.yml +++ b/.github/workflows/pr-run-test.yml @@ -8,10 +8,9 @@ on: - 'docs/**' - 'configs/**' - 'tools/**' + paths: + - '!.github/**' - workflow_dispatch: - schedule: - - cron: '56 22 * * *' concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -35,7 +34,6 @@ env: jobs: pr_run_test: runs-on: volc_cu12_local - environment: 'prod' timeout-minutes: 30 steps: - name: Checkout repository @@ -97,7 +95,6 @@ jobs: notify_to_feishu: if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} needs: [pr_run_test] - environment: 'prod' timeout-minutes: 5 runs-on: self-hosted steps: