diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index d8c778b7..4ff30916 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -258,9 +258,9 @@ class TestCmdCase: [('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'), ('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'), ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'), - ('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'), - ('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'), - ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')]) + ('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'), + ('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'), + ('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')]) def test_cmd_case2(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) @@ -278,13 +278,23 @@ class TestCmdCase: @pytest.mark.case4 @pytest.mark.parametrize( - 'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'), - ('internlm2_5-7b-chat_hf', 'race-high_accuracy'), - ('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')]) + 'model, dataset', [('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'), + ('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'), + ('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')]) def test_cmd_case4(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model, result_score, base_score, dataset) + assert_score(model + '_batch', result_score, base_score, dataset) + + @pytest.mark.case5 + @pytest.mark.parametrize( + 'model, dataset', [('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'), + ('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'), + ('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')]) + def 
test_cmd_case5(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score, dataset) def assert_score(model_type, score, baseline, dataset: str = ''): diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index 9cf6781e..666a2d21 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -9,14 +9,24 @@ internlm2_5-7b_hf: race-high_accuracy: 90.02 internlm2_5-7b-chat-lmdeploy: - demo_gsm8k_accuracy: 89.06 + demo_gsm8k_accuracy: 87.50 race-middle_accuracy: 92.76 race-high_accuracy: 90.54 -internlm2-chat-1.8b-lmdeploy: - demo_gsm8k_accuracy: 31 - race-middle_accuracy: 81.34 - race-high_accuracy: 73.96 +internlm3-8b-instruct-lmdeploy: + demo_gsm8k_accuracy: 73.44 + race-middle_accuracy: 93.38 + race-high_accuracy: 90.34 + +internlm3-8b-instruct_hf-lmdeploy: + demo_gsm8k_accuracy: 73.44 + race-middle_accuracy: 93.38 + race-high_accuracy: 90.34 + +internlm3-8b-instruct_hf-vllm: + demo_gsm8k_accuracy: 81.25 + race-middle_accuracy: 92.20 + race-high_accuracy: 89.88 internlm2_5-7b-chat_hf: demo_gsm8k_accuracy: 87.50 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 4e9594d1..b3e7127d 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -61,6 +61,7 @@ env: HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub CONDA_ENV: regression_test + VLLM_WORKER_MULTIPROC_METHOD: spawn jobs: build-pypi: @@ -263,10 +264,10 @@ jobs: opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details rm
regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3_8b-instruct -a lmdeploy--work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details + opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3_8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details + opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run model test - api @@ -275,7 +276,7 @@ jobs: . ${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} conda info --envs - lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & + lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & echo "restful_pid=$!" 
>> "$GITHUB_ENV" sleep 180s opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details