diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index 52897094..9b175daa 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -6,8 +6,11 @@ import yaml output_path = 'regression_result_daily' -model_list = ['internlm-7b-hf', 'internlm-chat-7b-hf'] -dataset_list = ['ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval'] +model_list = ['internlm-7b-hf', 'internlm-chat-7b-hf', 'chatglm3-6b-base-hf'] +dataset_list = [ + 'ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval', 'openbookqa', + 'openbookqa_fact' +] @pytest.fixture() @@ -34,8 +37,8 @@ class TestChat: @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in model_list for p2 in dataset_list]) - def test_demo_default(self, baseline_scores, result_scores, model, - dataset): + def test_model_dataset_score(self, baseline_scores, result_scores, model, + dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) assert_score(result_score, base_score) diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index e80d2df9..6e249541 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -3,9 +3,21 @@ internlm-7b-hf: chid-dev: 81.68 chid-test: 83.67 openai_humaneval: 10.37 + openbookqa: 44.4 + openbookqa_fact: 73.2 internlm-chat-7b-hf: ARC-c: 36.95 chid-dev: 71.78 chid-test: 76.87 openai_humaneval: 21.34 + openbookqa: 66.6 + openbookqa_fact: 80.4 + +chatglm3-6b-base-hf: + ARC-c: 43.05 + chid-dev: 80.2 + chid-test: 80.77 + openai_humaneval: 20.73 + openbookqa: 79.8 + openbookqa_fact: 92.2 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 232852af..922bf433 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -50,11 +50,12 @@ jobs: conda info --envs rm -rf regression_result_daily export from_tf=TRUE - python3 run.py --models hf_internlm_chat_7b hf_internlm_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl --work-dir regression_result_daily --debug + python3 run.py --models hf_internlm_chat_7b hf_internlm_7b hf_chatglm3_6b_base hf_chatglm3_6b hf_qwen_7b_chat hf_qwen_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl obqa_ppl --work-dir regression_result_daily - name: Get result run: | + eval "$(conda shell.bash hook)" pip install pytest --cache-dir ${{env.PIP_CACHE_PATH}} - pytest -s -v --color=yes .github/scripts/oc_score_assert.py + python -m pytest -s -v --color=yes .github/scripts/oc_score_assert.py - name: Remove Conda Env if: always() run: | @@ -71,4 +72,4 @@ jobs: steps: - name: notify run: | - curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }} + curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- Daily test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}