mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
update
This commit is contained in:
parent
9d63fdd616
commit
035a7cee0e
24
.github/scripts/oc_score_assert.py
vendored
24
.github/scripts/oc_score_assert.py
vendored
@ -258,9 +258,9 @@ class TestCmdCase:
|
||||
[('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
|
||||
('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
|
||||
('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
|
||||
('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'),
|
||||
('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'),
|
||||
('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')])
|
||||
('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'),
|
||||
('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'),
|
||||
('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')])
|
||||
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
@ -278,13 +278,23 @@ class TestCmdCase:
|
||||
|
||||
@pytest.mark.case4
|
||||
@pytest.mark.parametrize(
|
||||
'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'),
|
||||
('internlm2_5-7b-chat_hf', 'race-high_accuracy'),
|
||||
('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')])
|
||||
'model, dataset', [('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'),
|
||||
('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'),
|
||||
('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')])
|
||||
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model, result_score, base_score, dataset)
|
||||
assert_score(model + '_batch', result_score, base_score, dataset)
|
||||
|
||||
@pytest.mark.case5
|
||||
@pytest.mark.parametrize(
|
||||
'model, dataset', [('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'),
|
||||
('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'),
|
||||
('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')])
|
||||
def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(model + '_batch', result_score, base_score, dataset)
|
||||
|
||||
|
||||
def assert_score(model_type, score, baseline, dataset: str = ''):
|
||||
|
20
.github/scripts/oc_score_baseline.yaml
vendored
20
.github/scripts/oc_score_baseline.yaml
vendored
@ -9,14 +9,24 @@ internlm2_5-7b_hf:
|
||||
race-high_accuracy: 90.02
|
||||
|
||||
internlm2_5-7b-chat-lmdeploy:
|
||||
demo_gsm8k_accuracy: 89.06
|
||||
demo_gsm8k_accuracy: 87.50
|
||||
race-middle_accuracy: 92.76
|
||||
race-high_accuracy: 90.54
|
||||
|
||||
internlm2-chat-1.8b-lmdeploy:
|
||||
demo_gsm8k_accuracy: 31
|
||||
race-middle_accuracy: 81.34
|
||||
race-high_accuracy: 73.96
|
||||
internlm3-8b-instruct-lmdeploy:
|
||||
demo_gsm8k_accuracy: 73.44
|
||||
race-middle_accuracy: 93.38
|
||||
race-high_accuracy: 90.34
|
||||
|
||||
internlm3-8b-instruct_hf-lmdeploy:
|
||||
demo_gsm8k_accuracy: 73.44
|
||||
race-middle_accuracy: 93.38
|
||||
race-high_accuracy: 90.34
|
||||
|
||||
internlm3-8b-instruct_hf-vllm:
|
||||
demo_gsm8k_accuracy: 81.25
|
||||
race-middle_accuracy: 92.20
|
||||
race-high_accuracy: 89.88
|
||||
|
||||
internlm2_5-7b-chat_hf:
|
||||
demo_gsm8k_accuracy: 87.50
|
||||
|
7
.github/workflows/daily-run-test.yml
vendored
7
.github/workflows/daily-run-test.yml
vendored
@ -61,6 +61,7 @@ env:
|
||||
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
|
||||
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
|
||||
CONDA_ENV: regression_test
|
||||
export VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
|
||||
jobs:
|
||||
build-pypi:
|
||||
@ -263,10 +264,10 @@ jobs:
|
||||
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
|
||||
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3_8b-instruct -a lmdeploy--work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
|
||||
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
|
||||
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3_8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
|
||||
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
|
||||
python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Run model test - api
|
||||
@ -275,7 +276,7 @@ jobs:
|
||||
. ${{env.CONDA_PATH}}/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda info --envs
|
||||
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
|
||||
lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
|
||||
echo "restful_pid=$!" >> "$GITHUB_ENV"
|
||||
sleep 180s
|
||||
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
|
||||
|
Loading…
Reference in New Issue
Block a user