Update daily test (#871)

* add daily test case

* Update pr-run-test.yml

* Update daily-run-test.yml

* Update daily-run-test.yml

* Update pr-run-test.yml

* Update daily-run-test.yml

* Update oc_score_assert.py

* Update daily-run-test.yml

* Update daily-run-test.yml

* Update daily-run-test.yml

* update test case baseline

* fix test case name

* add more models into daily test

---------

Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
Co-authored-by: Leymore <zfz-960727@163.com>
This commit is contained in:
zhulinJulia24 2024-02-05 15:52:00 +08:00 committed by GitHub
parent fc84aff963
commit b4a9acd7be
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 23 additions and 7 deletions

View File

@@ -6,8 +6,11 @@ import yaml
output_path = 'regression_result_daily'
model_list = ['internlm-7b-hf', 'internlm-chat-7b-hf']
dataset_list = ['ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval']
model_list = ['internlm-7b-hf', 'internlm-chat-7b-hf', 'chatglm3-6b-base-hf']
dataset_list = [
'ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval', 'openbookqa',
'openbookqa_fact'
]
@pytest.fixture()
@@ -34,8 +37,8 @@ class TestChat:
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in model_list
for p2 in dataset_list])
def test_demo_default(self, baseline_scores, result_scores, model,
dataset):
def test_model_dataset_score(self, baseline_scores, result_scores, model,
dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, base_score)

View File

@@ -3,9 +3,21 @@ internlm-7b-hf:
chid-dev: 81.68
chid-test: 83.67
openai_humaneval: 10.37
openbookqa: 44.4
openbookqa_fact: 73.2
internlm-chat-7b-hf:
ARC-c: 36.95
chid-dev: 71.78
chid-test: 76.87
openai_humaneval: 21.34
openbookqa: 66.6
openbookqa_fact: 80.4
chatglm3-6b-base-hf:
ARC-c: 43.05
chid-dev: 80.2
chid-test: 80.77
openai_humaneval: 20.73
openbookqa: 79.8
openbookqa_fact: 92.2

View File

@@ -50,11 +50,12 @@ jobs:
conda info --envs
rm -rf regression_result_daily
export from_tf=TRUE
python3 run.py --models hf_internlm_chat_7b hf_internlm_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl --work-dir regression_result_daily --debug
python3 run.py --models hf_internlm_chat_7b hf_internlm_7b hf_chatglm3_6b_base hf_chatglm3_6b hf_qwen_7b_chat hf_qwen_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl obqa_ppl --work-dir regression_result_daily
- name: Get result
run: |
eval "$(conda shell.bash hook)"
pip install pytest --cache-dir ${{env.PIP_CACHE_PATH}}
pytest -s -v --color=yes .github/scripts/oc_score_assert.py
python -m pytest -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Remove Conda Env
if: always()
run: |
@@ -71,4 +72,4 @@ jobs:
steps:
- name: notify
run: |
curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- Daily test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}