diff --git a/.github/scripts/eval_regression_base.py b/.github/scripts/eval_regression_base.py
new file mode 100644
index 00000000..0d252380
--- /dev/null
+++ b/.github/scripts/eval_regression_base.py
@@ -0,0 +1,52 @@
+from mmengine.config import read_base
+
+with read_base():
+    # choose a list of datasets
+    from ...configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
+        gsm8k_datasets  # noqa: F401, E501
+    from ...configs.datasets.race.race_ppl import \
+        race_datasets  # noqa: F401, E501
+    # read hf models - base models
+    from ...configs.models.deepseek.hf_deepseek_moe_16b_base import \
+        models as hf_deepseek_moe_16b_base_model  # noqa: F401, E501
+    from ...configs.models.deepseek.lmdeploy_deepseek_7b_base import \
+        models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
+    from ...configs.models.deepseek.vllm_deepseek_moe_16b_base import \
+        models as vllm_deepseek_moe_16b_base_model  # noqa: F401, E501
+    from ...configs.models.gemma.hf_gemma_2b import \
+        models as hf_gemma_2b_model  # noqa: F401, E501
+    from ...configs.models.gemma.hf_gemma_7b import \
+        models as hf_gemma_7b_model  # noqa: F401, E501
+    from ...configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
+        models as lmdeploy_internlm2_1_8b_model  # noqa: F401, E501
+    from ...configs.models.hf_internlm.lmdeploy_internlm2_7b import \
+        models as lmdeploy_internlm2_7b_model  # noqa: F401, E501
+    from ...configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
+        models as lmdeploy_internlm2_base_7b_model  # noqa: F401, E501
+    from ...configs.models.hf_llama.lmdeploy_llama3_8b import \
+        models as lmdeploy_llama3_8b_model  # noqa: F401, E501
+    from ...configs.models.mistral.hf_mistral_7b_v0_2 import \
+        models as hf_mistral_7b_v0_2_model  # noqa: F401, E501
+    from ...configs.models.mistral.vllm_mistral_7b_v0_2 import \
+        models as vllm_mistral_7b_v0_2_model  # noqa: F401, E501
+    from ...configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
+        models as hf_qwen1_5_moe_a2_7b_model  # noqa: F401, E501
+    from ...configs.models.qwen.hf_qwen2_0_5b import \
+        models as hf_qwen2_0_5b_model  # noqa: F401, E501
+    from ...configs.models.qwen.lmdeploy_qwen2_1_5b import \
+        models as lmdeploy_qwen2_1_5b_model  # noqa: F401, E501
+    from ...configs.models.qwen.lmdeploy_qwen2_7b import \
+        models as lmdeploy_qwen2_7b_model  # noqa: F401, E501
+    from ...configs.models.qwen.vllm_qwen1_5_0_5b import \
+        models as vllm_qwen1_5_0_5b_model  # noqa: F401, E501
+    from ...configs.models.yi.hf_yi_1_5_6b import \
+        models as hf_yi_1_5_6b_model  # noqa: F401, E501
+    from ...configs.models.yi.hf_yi_1_5_9b import \
+        models as hf_yi_1_5_9b_model  # noqa: F401, E501
+    from ...configs.summarizers.medium import summarizer  # noqa: F401, E501
+
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
+
+for d in datasets:
+    d['reader_cfg']['test_range'] = '[0:100]'
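Both regression entry points assemble their workloads the same way: every config pulled in through `read_base()` binds a list named `*_model` or `*_datasets`, and the two `sum(...)` folds at the bottom scrape `locals()` for those suffixes and concatenate the lists. A minimal standalone sketch of the idiom, with `demo_*` placeholders standing in for the real imports:

    # Stand-ins for the lists that read_base() would normally import.
    demo_a_model = [dict(abbr='model-a')]
    demo_b_model = [dict(abbr='model-b')]
    demo_gsm8k_datasets = [dict(abbr='gsm8k', reader_cfg=dict(test_range=''))]

    # Concatenate every list whose variable name carries the right suffix.
    models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
    datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

    # Cap each dataset at its first 100 samples so the daily run stays cheap.
    for d in datasets:
        d['reader_cfg']['test_range'] = '[0:100]'

    assert [m['abbr'] for m in models] == ['model-a', 'model-b']

Note that the iterable of the leading `for` clause in a comprehension is evaluated in the enclosing scope, so `locals()` here sees the module namespace rather than the comprehension's own; the chat script below relies on the same behavior.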
diff --git a/.github/scripts/eval_regression_chat.py b/.github/scripts/eval_regression_chat.py
new file mode 100644
index 00000000..8e29834d
--- /dev/null
+++ b/.github/scripts/eval_regression_chat.py
@@ -0,0 +1,70 @@
+from mmengine.config import read_base
+
+with read_base():
+    # choose a list of datasets
+    from ...configs.datasets.gsm8k.gsm8k_gen import \
+        gsm8k_datasets  # noqa: F401, E501
+    from ...configs.datasets.race.race_gen import \
+        race_datasets  # noqa: F401, E501
+    # read hf models - chat models
+    from ...configs.models.baichuan.hf_baichuan2_7b_chat import \
+        models as hf_baichuan2_7b_chat_model  # noqa: F401, E501
+    from ...configs.models.chatglm.hf_glm4_9b_chat import \
+        models as hf_glm4_9b_chat_model  # noqa: F401, E501
+    from ...configs.models.deepseek.hf_deepseek_7b_chat import \
+        models as hf_deepseek_7b_chat_model  # noqa: F401, E501
+    from ...configs.models.deepseek.hf_deepseek_moe_16b_chat import \
+        models as hf_deepseek_moe_16b_chat_model  # noqa: F401, E501
+    from ...configs.models.deepseek.vllm_deepseek_7b_chat import \
+        models as vllm_deepseek_7b_chat_model  # noqa: F401, E501
+    from ...configs.models.gemma.hf_gemma_2b_it import \
+        models as hf_gemma_2b_it_model  # noqa: F401, E501
+    from ...configs.models.gemma.hf_gemma_7b_it import \
+        models as hf_gemma_7b_it_model  # noqa: F401, E501
+    from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
+        models as lmdeploy_internlm2_chat_1_8b_model  # noqa: F401, E501
+    from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
+        models as lmdeploy_internlm2_chat_1_8b_sft_model  # noqa: F401, E501
+    from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
+        models as lmdeploy_internlm2_chat_7b_model  # noqa: F401, E501
+    from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
+        models as lmdeploy_internlm2_chat_7b_sft_model  # noqa: F401, E501
+    from ...configs.models.hf_internlm.vllm_internlm2_chat_7b import \
+        models as vllm_internlm2_chat_7b_model  # noqa: F401, E501
+    from ...configs.models.hf_llama.hf_llama3_8b_instruct import \
+        models as hf_llama3_8b_instruct_model  # noqa: F401, E501
+    from ...configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
+        models as lmdeploy_llama3_8b_instruct_model  # noqa: F401, E501
+    from ...configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
+        models as hf_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
+    from ...configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
+        models as vllm_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
+    from ...configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
+        models as hf_minicpm_2b_dpo_fp32_model  # noqa: F401, E501
+    from ...configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
+        models as hf_minicpm_2b_sft_bf16_model  # noqa: F401, E501
+    from ...configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
+        models as hf_minicpm_2b_sft_fp32_model  # noqa: F401, E501
+    from ...configs.models.phi.hf_phi_3_mini_4k_instruct import \
+        models as hf_phi_3_mini_4k_instruct_model  # noqa: F401, E501
+    from ...configs.models.phi.hf_phi_3_small_8k_instruct import \
+        models as hf_phi_3_small_8k_instruct_model  # noqa: F401, E501
+    from ...configs.models.qwen.hf_qwen1_5_0_5b_chat import \
+        models as hf_qwen1_5_0_5b_chat_model  # noqa: F401, E501
+    from ...configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
+        models as lmdeploy_qwen2_1_5b_instruct_model  # noqa: F401, E501
+    from ...configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
+        models as lmdeploy_qwen2_7b_instruct_model  # noqa: F401, E501
+    from ...configs.models.qwen.vllm_qwen1_5_0_5b_chat import \
+        models as vllm_qwen1_5_0_5b_chat_model  # noqa: F401, E501
+    from ...configs.models.yi.hf_yi_1_5_6b_chat import \
+        models as hf_yi_1_5_6b_chat_model  # noqa: F401, E501
+    from ...configs.models.yi.hf_yi_1_5_9b_chat import \
+        models as hf_yi_1_5_9b_chat_model  # noqa: F401, E501
+    from ...configs.summarizers.medium import summarizer  # noqa: F401, E501
+
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
+
+for d in datasets:
+    d['reader_cfg']['test_range'] = '[0:100]'
diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py
index 7ca0a7be..a9b7691b 100644
--- a/.github/scripts/oc_score_assert.py
+++ b/.github/scripts/oc_score_assert.py
@@ -6,11 +6,26 @@ import yaml
 
 output_path = 'regression_result_daily'
 
-model_list = ['internlm2-7b-hf', 'internlm-chat-7b-hf', 'chatglm3-6b-base-hf']
-dataset_list = [
-    'ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval', 'openbookqa',
-    'openbookqa_fact'
-]
+chat_model_list = [
+    'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
+    'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2-chat-1.8b-turbomind',
+    'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind',
+    'internlm2-chat-7b-sft-turbomind', 'llama-3-8b-instruct-hf',
+    'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf',
+    'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
+    'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
+    'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-turbomind',
+    'qwen2-7b-instruct-turbomind', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf'
+]
+base_model_list = [
+    'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', 'gemma-2b-hf',
+    'gemma-7b-hf', 'internlm2-1.8b-turbomind', 'internlm2-7b-turbomind',
+    'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind',
+    'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
+    'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind', 'yi-1.5-6b-hf',
+    'yi-1.5-9b-hf'
+]
+dataset_list = ['gsm8k', 'race-middle', 'race-high']
 
 
 @pytest.fixture()
@@ -32,10 +47,28 @@ def result_scores():
 @pytest.mark.usefixtures('result_scores')
 @pytest.mark.usefixtures('baseline_scores')
+@pytest.mark.chat
 class TestChat:
     """Test cases for chat model."""
 
-    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in model_list
+    @pytest.mark.parametrize('model, dataset', [(p1, p2)
+                                                for p1 in chat_model_list
+                                                for p2 in dataset_list])
+    def test_model_dataset_score(self, baseline_scores, result_scores, model,
+                                 dataset):
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(result_score, base_score)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores')
+@pytest.mark.base
+class TestBase:
+    """Test cases for base model."""
+
+    @pytest.mark.parametrize('model, dataset', [(p1, p2)
+                                                for p1 in base_model_list
                                                 for p2 in dataset_list])
     def test_model_dataset_score(self, baseline_scores, result_scores, model,
                                  dataset):
@@ -47,13 +80,13 @@ class TestChat:
 def assert_score(score, baseline):
     if score is None or score == '-':
         assert False, 'value is none'
-    if float(score) < (baseline * 1.03) and float(score) > (baseline * 0.97):
-        print(score + ' between ' + str(baseline * 0.97) + ' and ' +
-              str(baseline * 1.03))
+    if (baseline - 5) <= float(score) <= (baseline + 5):
+        print(score + ' between ' + str(baseline - 5) + ' and ' +
+              str(baseline + 5))
         assert True
     else:
         assert False, score + ' not between ' + str(
-            baseline * 0.97) + ' and ' + str(baseline * 1.03)
+            baseline - 5) + ' and ' + str(baseline + 5)
@@ -62,11 +95,13 @@ def find_csv_files(directory):
     csv_files = []
     for file in files:
         if file.endswith('.csv'):
             csv_files.append(os.path.join(root, file))
-    if len(csv_files) > 1:
-        raise 'have more than 1 result file, please check the result manually'
-    if len(csv_files) == 0:
-        return None
-    return csv_files[0]
+
+    if not csv_files:
+        return None
+    csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
+    sorted_csv_files = sorted(csv_files_with_time.items(), key=lambda x: x[1])
+    latest_csv_file = sorted_csv_files[-1][0]
+    return latest_csv_file
 
 def read_csv_file(file_path):
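The daily workflow below selects these suites with `pytest -m chat` and `pytest -m base`, which depends on the `pytest.mark.chat` and `pytest.mark.base` markers added above. Unless those marks are already declared in the repo's pytest configuration, collection emits unknown-mark warnings (and fails outright under `--strict-markers`). A minimal `conftest.py` sketch that would register them, assuming no registration exists elsewhere:

    # conftest.py (hypothetical; only needed if the chat/base marks are not
    # already declared in a pytest.ini or pyproject.toml)
    def pytest_configure(config):
        config.addinivalue_line('markers', 'chat: regression tests for chat models')
        config.addinivalue_line('markers', 'base: regression tests for base models')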
diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml
index c65b2f13..14906442 100644
--- a/.github/scripts/oc_score_baseline.yaml
+++ b/.github/scripts/oc_score_baseline.yaml
@@ -1,31 +1,179 @@
-internlm-7b-hf:
-  ARC-c: 34.24
-  chid-dev: 79.70
-  chid-test: 81.12
-  openai_humaneval: 10.98
-  openbookqa: 47.20
-  openbookqa_fact: 74.00
+baichuan2-7b-chat-hf:
+  gsm8k: 30
+  race-middle: 74
+  race-high: 79
 
-internlm-chat-7b-hf:
-  ARC-c: 36.95
-  chid-dev: 71.78
-  chid-test: 76.87
-  openai_humaneval: 21.34
-  openbookqa: 66.6
-  openbookqa_fact: 80.4
+deepseek-7b-chat-hf:
+  gsm8k: 60
+  race-middle: 74
+  race-high: 80
 
-chatglm3-6b-base-hf:
-  ARC-c: 44.41
-  chid-dev: 78.22
-  chid-test: 78.57
-  openai_humaneval: 20.73
-  openbookqa: 78.40
-  openbookqa_fact: 92.00
+deepseek-moe-16b-chat-hf:
+  gsm8k: 62
+  race-middle: 62
+  race-high: 70
 
-internlm2-7b-hf:
-  ARC-c: 36.27
-  chid-dev: 55.94
-  chid-test: 53.70
-  openai_humaneval: 45.12
-  openbookqa: 80.00
-  openbookqa_fact: 86.40
+gemma-2b-it-hf:
+  gsm8k: 14
+  race-middle: 62
+  race-high: 52
+
+gemma-7b-it-hf:
+  gsm8k: 39
+  race-middle: 74
+  race-high: 71
+
+internlm2-chat-1.8b-turbomind:
+  gsm8k: 40
+  race-middle: 82
+  race-high: 83
+
+internlm2-chat-1.8b-sft-turbomind:
+  gsm8k: 32
+  race-middle: 81
+  race-high: 83
+
+internlm2-chat-7b-turbomind:
+  gsm8k: 69
+  race-middle: 90
+  race-high: 88
+
+internlm2-chat-7b-sft-turbomind:
+  gsm8k: 71
+  race-middle: 91
+  race-high: 92
+
+llama-3-8b-instruct-hf:
+  gsm8k: 77
+  race-middle: 85
+  race-high: 87
+
+llama-3-8b-instruct-turbomind:
+  gsm8k: 77
+  race-middle: 85
+  race-high: 89
+
+mistral-7b-instruct-v0.2-hf:
+  gsm8k: 48
+  race-middle: 82
+  race-high: 78
+
+minicpm-2b-dpo-fp32-hf:
+  gsm8k: 58
+  race-middle: 66
+  race-high: 74
+
+minicpm-2b-sft-bf16-hf:
+  gsm8k: 58
+  race-middle: 75
+  race-high: 81
+
+minicpm-2b-sft-fp32-hf:
+  gsm8k: 58
+  race-middle: 75
+  race-high: 81
+
+phi-3-mini-4k-instruct-hf:
+  gsm8k: 67
+  race-middle: 81
+  race-high: 84
+
+qwen1.5-0.5b-chat-hf:
+  gsm8k: 5
+  race-middle: 55
+  race-high: 50
+
+qwen2-1.5b-instruct-turbomind:
+  gsm8k: 60
+  race-middle: 77
+  race-high: 86
+
+qwen2-7b-instruct-turbomind:
+  gsm8k: 88
+  race-middle: 87
+  race-high: 89
+
+yi-1.5-6b-chat-hf:
+  gsm8k: 72
+  race-middle: 88
+  race-high: 86
+
+yi-1.5-9b-chat-hf:
+  gsm8k: 81
+  race-middle: 89
+  race-high: 91
+
+deepseek-moe-16b-base-hf:
+  gsm8k: 25
+  race-middle: 35
+  race-high: 23
+
+deepseek-7b-base-turbomind:
+  gsm8k: 21
+  race-middle: 42
+  race-high: 42
+
+gemma-2b-hf:
+  gsm8k: 19
+  race-middle: 33
+  race-high: 26
+
+gemma-7b-hf:
+  gsm8k: 65
+  race-middle: 59
+  race-high: 66
+
+internlm2-1.8b-turbomind:
+  gsm8k: 27
+  race-middle: 75
+  race-high: 72
+
+internlm2-7b-turbomind:
+  gsm8k: 67
+  race-middle: 78
+  race-high: 76
+
+internlm2-base-7b-turbomind:
+  gsm8k: 39
+  race-middle: 75
+  race-high: 81
+
+llama-3-8b-turbomind:
+  gsm8k: 52
+  race-middle: 63
+  race-high: 70
+
+mistral-7b-v0.2-hf:
+  gsm8k: 43
+  race-middle: 42
+  race-high: 60
+
+qwen1.5-moe-a2.7b-hf:
+  gsm8k: 64
+  race-middle: 78
+  race-high: 90
+
+qwen2-0.5b-hf:
+  gsm8k: 35
+  race-middle: 52
+  race-high: 48
+
+qwen2-1.5b-turbomind:
+  gsm8k: 57
+  race-middle: 64
+  race-high: 78
+
+qwen2-7b-turbomind:
+  gsm8k: 83
+  race-middle: 88
+  race-high: 88
+
+yi-1.5-6b-hf:
+  gsm8k: 59
+  race-middle: 81
+  race-high: 89
+
+yi-1.5-9b-hf:
+  gsm8k: 77
+  race-middle: 90
+  race-high: 90
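Each block in the baseline file keys per-dataset reference scores by model abbreviation; the parametrized tests read them back as `baseline_scores.get(model).get(dataset)` and apply the ±5-point band from `assert_score`. A small sketch of that lookup path, reusing the `qwen2-7b-instruct-turbomind` entry above:

    import yaml

    # Baselines are keyed by model abbr, then by dataset name.
    with open('.github/scripts/oc_score_baseline.yaml') as f:
        baseline_scores = yaml.safe_load(f)

    baseline = baseline_scores['qwen2-7b-instruct-turbomind']['gsm8k']  # 88
    result = 85.3  # e.g. parsed from the latest summary CSV
    assert baseline - 5 <= result <= baseline + 5  # passes: 83 <= 85.3 <= 93

Compared with the old ±3% relative band, the absolute ±5 band is looser everywhere on a 0-100 scale (3% of even a 100-point baseline is only ±3), which suits the noisier 100-sample slices these runs evaluate.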
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
index 15f359f2..7e01a3a4 100644
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@@ -31,34 +31,46 @@ jobs:
         eval "$(conda shell.bash hook)"
         conda create -y --name ${{env.CONDA_ENV}} python=3.10
         conda activate ${{env.CONDA_ENV}}
-        pip install torch torchvision torchaudio --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
-        pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-*.whl
+        pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
+        pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.0.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
+        FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl --index-url https://download.pytorch.org/whl/cu118
+        pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
+        pip install xformers==0.0.25.post1 --cache-dir ${{env.PIP_CACHE_PATH}} --extra-index-url https://download.pytorch.org/whl/cu118
         conda info --envs
     - name: Prepare - Pip install code
       run: |
         eval "$(conda shell.bash hook)"
         conda activate ${{env.CONDA_ENV}}
         pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
-        pip install human_eval transformers protobuf --cache-dir ${{env.PIP_CACHE_PATH}}
+        pip install human_eval transformers protobuf pytest --cache-dir ${{env.PIP_CACHE_PATH}}
         conda info --envs
     - name: Prepare - prepare data and hf model
      run: |
         cp -r ${{env.USERSPACE_PREFIX}}/data .
         rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
         ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
-    - name: Run test
+    - name: Run chat model test
       run: |
         eval "$(conda shell.bash hook)"
         conda activate ${{env.CONDA_ENV}}
         conda info --envs
         rm -rf regression_result_daily
         export from_tf=TRUE
-        python3 run.py --models hf_internlm_chat_7b hf_internlm2_7b hf_chatglm3_6b_base hf_chatglm3_6b hf_qwen_7b_chat hf_qwen_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl obqa_ppl --work-dir regression_result_daily
-    - name: Get result
+        rm -rf /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary
+        python3 run.py .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }} --reuse
+        cp -r /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary regression_result_daily
+        python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
+    - name: Run base model test
       run: |
         eval "$(conda shell.bash hook)"
-        pip install pytest --cache-dir ${{env.PIP_CACHE_PATH}}
-        python -m pytest -s -v --color=yes .github/scripts/oc_score_assert.py
+        conda activate ${{env.CONDA_ENV}}
+        conda info --envs
+        rm -rf regression_result_daily
+        export from_tf=TRUE
+        rm -rf /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary
+        python3 run.py .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }} --reuse
+        cp -r /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary regression_result_daily
+        python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
     - name: Remove Conda Env
       if: always()
       run: |
diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml
index cd3399cf..2fcdf59f 100644
--- a/.github/workflows/pr-run-test.yml
+++ b/.github/workflows/pr-run-test.yml
@@ -31,7 +31,7 @@ jobs:
     environment: 'prod'
     timeout-minutes: 30
     steps:
-    - name: Clone repository
+    - name: Checkout repository
       uses: actions/checkout@v2
     - name: Prepare - Install opencompass
       run: |
diff --git a/configs/models/deepseek/vllm_deepseek_moe_16b_base.py b/configs/models/deepseek/vllm_deepseek_moe_16b_base.py
index 1c6097f5..36fcf5cf 100644
--- a/configs/models/deepseek/vllm_deepseek_moe_16b_base.py
+++ b/configs/models/deepseek/vllm_deepseek_moe_16b_base.py
@@ -3,7 +3,7 @@ from opencompass.models import VLLM
 models = [
     dict(
         type=VLLM,
-        abbr='deepseek-moe-16b-base-hf',
+        abbr='deepseek-moe-16b-base-vllm',
        path='deepseek-ai/deepseek-moe-16b-base',
         model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.6),
         max_out_len=1024,