From 94eb90569f63800c237f32a35385dff93b43f13a Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Thu, 16 May 2024 15:32:57 +0800 Subject: [PATCH] update test workflow (#1167) * Update pr-run-test.yml * Update daily-run-test.yml * Update daily-run-test.yml * Update pr-run-test.yml * Update daily-run-test.yml * Update daily-run-test.yml * Update daily-run-test.yml * Update daily-run-test.yml * Update oc_score_baseline.yaml * Update daily-run-test.yml * Update oc_score_assert.py --------- Co-authored-by: zhulin1 --- .github/scripts/oc_score_assert.py | 2 +- .github/scripts/oc_score_baseline.yaml | 30 ++++++++++++++++---------- .github/workflows/daily-run-test.yml | 6 +++--- .github/workflows/pr-run-test.yml | 6 +++--- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index 9b175daa..7ca0a7be 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -6,7 +6,7 @@ import yaml output_path = 'regression_result_daily' -model_list = ['internlm-7b-hf', 'internlm-chat-7b-hf', 'chatglm3-6b-base-hf'] +model_list = ['internlm2-7b-hf', 'internlm-chat-7b-hf', 'chatglm3-6b-base-hf'] dataset_list = [ 'ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval', 'openbookqa', 'openbookqa_fact' diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index 6e249541..b3b86f67 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -1,10 +1,10 @@ internlm-7b-hf: - ARC-c: 36.27 - chid-dev: 81.68 - chid-test: 83.67 - openai_humaneval: 10.37 - openbookqa: 44.4 - openbookqa_fact: 73.2 + ARC-c: 34.24 + chid-dev: 79.70 + chid-test: 81.12 + openai_humaneval: 10.98 + openbookqa: 47.20 + openbookqa_fact: 74.00 internlm-chat-7b-hf: ARC-c: 36.95 @@ -15,9 +15,17 @@ internlm-chat-7b-hf: openbookqa_fact: 80.4 chatglm3-6b-base-hf: - ARC-c: 43.05 - chid-dev: 80.2 - chid-test: 80.77 + ARC-c: 44.41 + chid-dev: 78.22 + chid-test: 78.57 openai_humaneval: 20.73 - openbookqa: 79.8 - openbookqa_fact: 92.2 + openbookqa: 78.40 + openbookqa_fact: 92.00 + +internlm2-7b-hf: + ARC-c: 34.92 + chid-dev: 55.94 + chid-test: 53.70 + openai_humaneval: 44.51 + openbookqa: 83.00 + openbookqa_fact: 83.00 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 1b887b23..6d5f582b 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -45,7 +45,6 @@ jobs: cp -r ${{env.USERSPACE_PREFIX}}/data . rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub - export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_HUB_OFFLINE=1; - name: Run test run: | eval "$(conda shell.bash hook)" @@ -53,7 +52,7 @@ jobs: conda info --envs rm -rf regression_result_daily export from_tf=TRUE - python3 run.py --models hf_internlm_chat_7b hf_internlm_7b hf_chatglm3_6b_base hf_chatglm3_6b hf_qwen_7b_chat hf_qwen_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl obqa_ppl --work-dir regression_result_daily + python3 run.py --models hf_internlm_chat_7b hf_internlm2_7b hf_chatglm3_6b_base hf_chatglm3_6b hf_qwen_7b_chat hf_qwen_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl obqa_ppl --work-dir regression_result_daily - name: Get result run: | eval "$(conda shell.bash hook)" @@ -62,8 +61,9 @@ jobs: - name: Remove Conda Env if: always() run: | + cp -r regression_result_daily/* /cpfs01/user/qa-llm-cicd/report eval "$(conda shell.bash hook)" - conda env remove --name ${{env.CONDA_ENV}} + conda env remove -y --name ${{env.CONDA_ENV}} conda info --envs notify_to_feishu: diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml index a754c4aa..cd3399cf 100644 --- a/.github/workflows/pr-run-test.yml +++ b/.github/workflows/pr-run-test.yml @@ -55,10 +55,10 @@ jobs: - name: Get result run: | score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}') - if (( ${score%.*} >= 75 && ${score%.*} <= 85 )); then - echo "score is $score between 75 and 85" + if (( ${score%.*} >= 79 && ${score%.*} <= 81 )); then + echo "score is $score between 79 and 81" else - echo "score is $score not between 75 and 85" + echo "score is $score not between 79 and 81" exit 1 fi rm -rf regression_result