mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feature] Add daily test case (#864)
* add daily test case * Update pr-run-test.yml * Update daily-run-test.yml * Update daily-run-test.yml * Update pr-run-test.yml --------- Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
This commit is contained in:
parent
4c87e777d8
commit
0919b08ec8
93
.github/scripts/oc_score_assert.py
vendored
Normal file
93
.github/scripts/oc_score_assert.py
vendored
Normal file
@ -0,0 +1,93 @@
|
||||
import csv
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
# Directory that is searched (recursively) for the summary CSV to check.
output_path = 'regression_result_daily'
|
||||
|
||||
# Model names to verify; each is looked up in both the baseline and the result.
model_list = ['internlm-7b-hf', 'internlm-chat-7b-hf']
|
||||
# Dataset names to verify; crossed with model_list to parametrize the test.
dataset_list = ['ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval']
|
||||
|
||||
|
||||
@pytest.fixture()
def baseline_scores(request):
    """Load the expected scores from ``oc_score_baseline.yaml``.

    The file path is resolved relative to the pytest root directory and the
    content is parsed with the safe YAML loader. The parsed document is
    returned unchanged.
    """
    baseline_file = os.path.join(request.config.rootdir,
                                 '.github/scripts/oc_score_baseline.yaml')
    with open(baseline_file) as stream:
        raw_yaml = stream.read()
    return yaml.load(raw_yaml, Loader=yaml.SafeLoader)
|
||||
|
||||
|
||||
@pytest.fixture()
def result_scores():
    """Parse the result CSV located under ``output_path``.

    Returns ``None`` when ``find_csv_files`` finds no CSV file; otherwise
    returns whatever ``read_csv_file`` produces for the located file.
    """
    csv_path = find_csv_files(output_path)
    return read_csv_file(csv_path) if csv_path is not None else None
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
class TestChat:
    """Test cases for chat model."""

    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in model_list
                                                for p2 in dataset_list])
    def test_demo_default(self, baseline_scores, result_scores, model,
                          dataset):
        """Compare one (model, dataset) result against its baseline score.

        Missing baselines or results fail with an explicit message instead
        of an ``AttributeError`` raised by calling ``.get`` on ``None``.
        """
        model_baseline = baseline_scores.get(model)
        assert model_baseline is not None, f'no baseline for model {model}'
        base_score = model_baseline.get(dataset)
        assert base_score is not None, (
            f'no baseline for model {model} on dataset {dataset}')

        # The result fixture yields None when no CSV file was found.
        assert result_scores is not None, 'no result csv file was found'
        model_result = result_scores.get(model)
        assert model_result is not None, f'no result for model {model}'

        # A missing dataset entry is reported by assert_score itself.
        assert_score(model_result.get(dataset), base_score)
|
||||
|
||||
|
||||
def assert_score(score, baseline):
    """Fail unless ``score`` lies strictly within 3% of ``baseline``.

    Args:
        score: Measured score, either a number or a numeric string.
            ``None`` and ``'-'`` are treated as a missing value.
        baseline: Expected score as a number.

    Raises:
        AssertionError: If the score is missing, or is not strictly between
            ``baseline * 0.97`` and ``baseline * 1.03``.
    """
    # Raise explicitly rather than `assert False`, which is removed under
    # `python -O` and would let a failing check pass silently.
    if score is None or score == '-':
        raise AssertionError('value is none')

    lower, upper = baseline * 0.97, baseline * 1.03
    if lower < float(score) < upper:
        # f-string formatting works for both str and numeric scores.
        print(f'{score} between {lower} and {upper}')
        return
    raise AssertionError(f'{score} not between {lower} and {upper}')
|
||||
|
||||
|
||||
def find_csv_files(directory):
    """Return the path of the single ``.csv`` file found under ``directory``.

    The directory tree is walked recursively.

    Args:
        directory: Root directory to search.

    Returns:
        The path of the only CSV file found, or ``None`` when there is none.

    Raises:
        RuntimeError: If more than one CSV file is found.
    """
    csv_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                csv_files.append(os.path.join(root, file))

    if len(csv_files) > 1:
        # Raising a bare string is a TypeError in Python 3; use a real
        # exception so the message actually reaches the user.
        raise RuntimeError(
            'have more than 1 result file, please check the result manually')
    if not csv_files:
        return None
    return csv_files[0]
|
||||
|
||||
|
||||
def read_csv_file(file_path):
    """Read a summary CSV into a ``{column: {dataset: value}}`` mapping.

    The ``version``, ``metric`` and ``mode`` columns are ignored. Every other
    column except ``dataset`` becomes a top-level key whose value maps each
    row's ``dataset`` cell to that row's cell in the column. Values are kept
    as the strings read from the file. If a dataset appears in several rows,
    the last row wins.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A dict of dicts as described above; empty when the file has no rows.
    """
    skipped = {'dataset', 'version', 'metric', 'mode'}
    result = {}
    # newline='' is required by the csv module so that newlines embedded in
    # quoted fields are handed to the reader untranslated.
    with open(file_path, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            dataset = row.get('dataset')
            for column, value in row.items():
                if column in skipped:
                    continue
                result.setdefault(column, {})[dataset] = value
    return result
|
11
.github/scripts/oc_score_baseline.yaml
vendored
Normal file
11
.github/scripts/oc_score_baseline.yaml
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
internlm-7b-hf:
|
||||
ARC-c: 36.27
|
||||
chid-dev: 81.68
|
||||
chid-test: 83.67
|
||||
openai_humaneval: 10.37
|
||||
|
||||
internlm-chat-7b-hf:
|
||||
ARC-c: 36.95
|
||||
chid-dev: 71.78
|
||||
chid-test: 76.87
|
||||
openai_humaneval: 21.34
|
74
.github/workflows/daily-run-test.yml
vendored
Normal file
74
.github/workflows/daily-run-test.yml
vendored
Normal file
@ -0,0 +1,74 @@
|
||||
name: daily_run_test
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '56 16 * * *'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CONDA_ENV: opencompass_regression
|
||||
PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
|
||||
USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
|
||||
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
|
||||
|
||||
jobs:
|
||||
daily_run_test:
|
||||
runs-on: self-hosted
|
||||
environment: 'prod'
|
||||
timeout-minutes: 240 #4hours
|
||||
steps:
|
||||
- name: Clone repository
|
||||
uses: actions/checkout@v2
|
||||
- name: Prepare - create conda env and install torch
|
||||
run: |
|
||||
eval "$(conda shell.bash hook)"
|
||||
conda create -y --name ${{env.CONDA_ENV}} python=3.10
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
pip install torch torchvision torchaudio --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
|
||||
conda info --envs
|
||||
- name: Prepare - Pip install code
|
||||
run: |
|
||||
eval "$(conda shell.bash hook)"
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install human_eval transformers==4.33.0 --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
conda info --envs
|
||||
- name: Prepare - prepare data and hf model
  run: |
    cp -r ${{env.USERSPACE_PREFIX}}/data .
    rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
    ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
    # Each `run` step gets its own shell, so a plain `export` here would be
    # lost before "Run test". Persist the offline flags through GITHUB_ENV.
    echo "HF_DATASETS_OFFLINE=1" >> "$GITHUB_ENV"
    echo "TRANSFORMERS_OFFLINE=1" >> "$GITHUB_ENV"
|
||||
- name: Run test
|
||||
run: |
|
||||
eval "$(conda shell.bash hook)"
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda info --envs
|
||||
rm -rf regression_result_daily
|
||||
export from_tf=TRUE
|
||||
python3 run.py --models hf_internlm_chat_7b hf_internlm_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl --work-dir regression_result_daily --debug
|
||||
- name: Get result
|
||||
run: |
|
||||
pip install pytest --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pytest -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Remove Conda Env
|
||||
if: always()
|
||||
run: |
|
||||
eval "$(conda shell.bash hook)"
|
||||
conda env remove --name ${{env.CONDA_ENV}}
|
||||
conda info --envs
|
||||
|
||||
notify_to_feishu:
|
||||
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
|
||||
needs: [daily_run_test]
|
||||
environment: 'prod'
|
||||
timeout-minutes: 5
|
||||
runs-on: self-hosted
|
||||
steps:
|
||||
# Post a failure notice for this daily workflow to the configured webhook.
- name: notify
  run: |
    curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- daily test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
|
28
.github/workflows/pr-run-test.yml
vendored
28
.github/workflows/pr-run-test.yml
vendored
@ -18,28 +18,30 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CONDA_ENV: opencompass_regression_daily
|
||||
CONDA_ENV: opencompass_base
|
||||
USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
|
||||
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
|
||||
|
||||
jobs:
|
||||
pr_run_test:
|
||||
runs-on: self-hosted
|
||||
environment: 'prod'
|
||||
timeout-minutes: 20
|
||||
timeout-minutes: 30
|
||||
steps:
|
||||
- name: Clone repository
|
||||
uses: actions/checkout@v2
|
||||
- name: Prepare - create conda env and install code
|
||||
- name: Prepare - Install opencompass
|
||||
run: |
|
||||
eval "$(conda shell.bash hook)"
|
||||
conda create --name ${{env.CONDA_ENV}} --clone opencompass_base --offline
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
python3 -m pip install -e .
|
||||
python3 -m pip uninstall opencompass -y
|
||||
python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip
|
||||
conda info --envs
|
||||
- name: Prepare - prepare data and hf model
|
||||
run: |
|
||||
cp -r /cpfs01/user/qa-llm-cicd/data .
|
||||
cp -r ${{env.USERSPACE_PREFIX}}/data .
|
||||
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
|
||||
ln -s /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub ~/.cache/huggingface/hub
|
||||
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
|
||||
export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1;
|
||||
- name: Run test
|
||||
run: |
|
||||
@ -49,21 +51,21 @@ jobs:
|
||||
rm -rf regression_result
|
||||
python3 run.py --models hf_internlm_chat_7b --datasets siqa_gen --work-dir regression_result --debug
|
||||
- name: Get result
|
||||
if: always()
|
||||
run: |
|
||||
score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}')
|
||||
if (( ${score%.*} >= 70 && ${score%.*} <= 80 )); then
|
||||
echo "score is $score between 70 and 80"
|
||||
if (( ${score%.*} >= 70 && ${score%.*} <= 75 )); then
|
||||
echo "score is $score between 70 and 75"
|
||||
else
|
||||
echo "score is $score not between 70 and 80"
|
||||
echo "score is $score not between 70 and 75"
|
||||
exit 1
|
||||
fi
|
||||
rm -rf regression_result
|
||||
- name: Remove Conda Env
|
||||
- name: Uninstall opencompass
|
||||
if: always()
|
||||
run: |
|
||||
eval "$(conda shell.bash hook)"
|
||||
conda env remove --name ${{env.CONDA_ENV}}
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
python3 -m pip uninstall opencompass -y
|
||||
conda info --envs
|
||||
|
||||
notify_to_feishu:
|
||||
|
Loading…
Reference in New Issue
Block a user