mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[ci] update dlc setting (#2112)
This commit is contained in:
parent
aa2b89b6f8
commit
c3779ebfc1
32
.github/workflows/daily-run-test.yml
vendored
32
.github/workflows/daily-run-test.yml
vendored
@ -54,12 +54,14 @@ env:
|
|||||||
LMDEPLOY_USE_MODELSCOPE: false
|
LMDEPLOY_USE_MODELSCOPE: false
|
||||||
HF_HUB_OFFLINE: 1
|
HF_HUB_OFFLINE: 1
|
||||||
OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
|
OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
|
||||||
CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3
|
CONDA_PATH: ${{ secrets.WORKSPACE_PREFIX }}/miniconda3
|
||||||
PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip
|
PIP_CACHE_PATH: ${{ secrets.WORKSPACE_PREFIX }}/.cache/pip
|
||||||
REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/regression
|
REPORT_ROOT: ${{ secrets.WORKSPACE_PREFIX }}/eval_report/regression
|
||||||
COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
|
COMPASS_DATA_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/datasets/compass_data_cache
|
||||||
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
|
HUGGINGFACE_HUB_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/models/opencompass_hf_hub
|
||||||
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
|
HF_HUB_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/models/opencompass_hf_hub
|
||||||
|
HF_DATASETS_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/datasets/hf_datasets_cache
|
||||||
|
HF_ENDPOINT: https://hf-mirror.com
|
||||||
CONDA_ENV: regression_test
|
CONDA_ENV: regression_test
|
||||||
export VLLM_WORKER_MULTIPROC_METHOD: spawn
|
export VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||||
|
|
||||||
@ -140,7 +142,7 @@ jobs:
|
|||||||
- name: Remove Conda Env
|
- name: Remove Conda Env
|
||||||
if: always()
|
if: always()
|
||||||
run: |
|
run: |
|
||||||
. /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
|
. ${{ secrets.WORKSPACE_PREFIX }}/miniconda3/bin/activate
|
||||||
conda env remove -y --name ${{env.CONDA_ENV}}
|
conda env remove -y --name ${{env.CONDA_ENV}}
|
||||||
conda info --envs
|
conda info --envs
|
||||||
- name: Prepare - create conda env and install torch - cu12
|
- name: Prepare - create conda env and install torch - cu12
|
||||||
@ -152,14 +154,14 @@ jobs:
|
|||||||
. ${{env.CONDA_PATH}}/bin/activate
|
. ${{env.CONDA_PATH}}/bin/activate
|
||||||
conda create -y --name ${{env.CONDA_ENV}} python=3.10
|
conda create -y --name ${{env.CONDA_ENV}} python=3.10
|
||||||
conda activate ${{env.CONDA_ENV}}
|
conda activate ${{env.CONDA_ENV}}
|
||||||
pip install -r /fs-computility/llm/qa-llm-cicd/config/requirements.txt --cache-dir ${{env.PIP_CACHE_PATH}}
|
pip install -r ${{ secrets.WORKSPACE_PREFIX }}/config/requirements.txt --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||||
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||||
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
|
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||||
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
|
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||||
pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}}
|
pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||||
pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}}
|
pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||||
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}}
|
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||||
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install ${{ secrets.WORKSPACE_PREFIX }}/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
||||||
pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
|
pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||||
cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
|
cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
|
||||||
- name: Prepare - reinstall lmdeploy - cu12
|
- name: Prepare - reinstall lmdeploy - cu12
|
||||||
@ -205,8 +207,8 @@ jobs:
|
|||||||
- name: modify config
|
- name: modify config
|
||||||
if: matrix.regression_func != 'chat_sub_fullbench'
|
if: matrix.regression_func != 'chat_sub_fullbench'
|
||||||
run: |
|
run: |
|
||||||
cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py .
|
cp -r ${{ secrets.WORKSPACE_PREFIX }}/ocplayground/template/configs_cluster/volc.py .
|
||||||
cat /fs-computility/llm/qa-llm-cicd/config/test_config.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
|
cat ${{ secrets.WORKSPACE_PREFIX }}/config/test_config.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
|
||||||
- name: Run test
|
- name: Run test
|
||||||
uses: nick-fields/retry@v3
|
uses: nick-fields/retry@v3
|
||||||
with:
|
with:
|
||||||
@ -245,8 +247,8 @@ jobs:
|
|||||||
- name: modify config
|
- name: modify config
|
||||||
if: matrix.regression_func == 'chat_sub_fullbench'
|
if: matrix.regression_func == 'chat_sub_fullbench'
|
||||||
run: |
|
run: |
|
||||||
cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py .
|
cp -r ${{ secrets.WORKSPACE_PREFIX }}/ocplayground/template/configs_cluster/volc.py .
|
||||||
cat /fs-computility/llm/qa-llm-cicd/config/test_config_sub.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
|
cat ${{ secrets.WORKSPACE_PREFIX }}/config/test_config_sub.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
|
||||||
- name: Run command testcase
|
- name: Run command testcase
|
||||||
if: matrix.regression_func == 'cmd'
|
if: matrix.regression_func == 'cmd'
|
||||||
run: |
|
run: |
|
||||||
@ -292,7 +294,7 @@ jobs:
|
|||||||
- name: Run testcase
|
- name: Run testcase
|
||||||
if: matrix.regression_func == 'chat_sub_fullbench'
|
if: matrix.regression_func == 'chat_sub_fullbench'
|
||||||
env:
|
env:
|
||||||
COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache_subset
|
COMPASS_DATA_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/datasets/compass_data_cache_subset
|
||||||
run: |
|
run: |
|
||||||
. ${{env.CONDA_PATH}}/bin/activate
|
. ${{env.CONDA_PATH}}/bin/activate
|
||||||
conda activate ${{env.CONDA_ENV}}
|
conda activate ${{env.CONDA_ENV}}
|
||||||
@ -333,7 +335,7 @@ jobs:
|
|||||||
conda activate ${{env.CONDA_ENV}}
|
conda activate ${{env.CONDA_ENV}}
|
||||||
conda info --envs
|
conda info --envs
|
||||||
export from_tf=TRUE
|
export from_tf=TRUE
|
||||||
opencompass /fs-computility/llm/qa-llm-cicd/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
|
opencompass ${{ secrets.WORKSPACE_PREFIX }}/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
|
||||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily
|
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily
|
||||||
python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py
|
python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||||
|
|
||||||
|
@ -45,6 +45,7 @@ class DLCRunner(BaseRunner):
|
|||||||
debug: bool = False,
|
debug: bool = False,
|
||||||
lark_bot_url: str = None,
|
lark_bot_url: str = None,
|
||||||
keep_tmp_file: bool = True,
|
keep_tmp_file: bool = True,
|
||||||
|
preemptible: bool = False,
|
||||||
):
|
):
|
||||||
super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
|
super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
|
||||||
self.aliyun_cfg = aliyun_cfg
|
self.aliyun_cfg = aliyun_cfg
|
||||||
@ -53,6 +54,7 @@ class DLCRunner(BaseRunner):
|
|||||||
|
|
||||||
self.eval_with_gpu = eval_with_gpu
|
self.eval_with_gpu = eval_with_gpu
|
||||||
self.keep_tmp_file = keep_tmp_file
|
self.keep_tmp_file = keep_tmp_file
|
||||||
|
self.preemptible = preemptible
|
||||||
if lark_bot_url:
|
if lark_bot_url:
|
||||||
self.lark_reporter = LarkReporter(lark_bot_url)
|
self.lark_reporter = LarkReporter(lark_bot_url)
|
||||||
else:
|
else:
|
||||||
@ -132,14 +134,16 @@ class DLCRunner(BaseRunner):
|
|||||||
shell_cmd = (f'source {bashrc_path}; '
|
shell_cmd = (f'source {bashrc_path}; '
|
||||||
f'conda activate {conda_env_name}; ')
|
f'conda activate {conda_env_name}; ')
|
||||||
shell_cmd += f'export PYTHONPATH={pwd}:$PYTHONPATH; '
|
shell_cmd += f'export PYTHONPATH={pwd}:$PYTHONPATH; '
|
||||||
else:
|
elif self.aliyun_cfg.get('python_env_path') is not None:
|
||||||
# using public conda env
|
# using public conda env
|
||||||
# users can also set `python_env_path` to their
|
# users can also set `python_env_path` to their
|
||||||
# own env python path
|
# own env python path
|
||||||
assert self.aliyun_cfg.get('python_env_path') is not None
|
|
||||||
shell_cmd = (
|
shell_cmd = (
|
||||||
f'export PATH={self.aliyun_cfg["python_env_path"]}/bin:$PATH; ' # noqa: E501
|
f'''export PATH={self.aliyun_cfg['python_env_path']}/bin:$PATH; ''' # noqa: E501
|
||||||
f'export PYTHONPATH={pwd}:$PYTHONPATH; ')
|
f'export PYTHONPATH={pwd}:$PYTHONPATH; ')
|
||||||
|
else:
|
||||||
|
# using system python
|
||||||
|
shell_cmd = ''
|
||||||
|
|
||||||
huggingface_cache = self.aliyun_cfg.get('huggingface_cache')
|
huggingface_cache = self.aliyun_cfg.get('huggingface_cache')
|
||||||
if huggingface_cache is not None:
|
if huggingface_cache is not None:
|
||||||
@ -178,8 +182,9 @@ class DLCRunner(BaseRunner):
|
|||||||
task_priority = self.aliyun_cfg.get('priority', 1)
|
task_priority = self.aliyun_cfg.get('priority', 1)
|
||||||
worker_cpu = self.aliyun_cfg.get('worker_cpu', 12)
|
worker_cpu = self.aliyun_cfg.get('worker_cpu', 12)
|
||||||
worker_memory = self.aliyun_cfg.get('worker_memory', 192)
|
worker_memory = self.aliyun_cfg.get('worker_memory', 192)
|
||||||
config_path = (f" --config {self.aliyun_cfg['dlc_config_path']}"
|
config_path = (
|
||||||
if 'dlc_config_path' in self.aliyun_cfg else '')
|
f''' --config {self.aliyun_cfg['dlc_config_path']}'''
|
||||||
|
if 'dlc_config_path' in self.aliyun_cfg else '')
|
||||||
|
|
||||||
# Different dlc versions has different commands
|
# Different dlc versions has different commands
|
||||||
if self.aliyun_cfg.get('dlc_job_cmd') == 'create':
|
if self.aliyun_cfg.get('dlc_job_cmd') == 'create':
|
||||||
@ -188,29 +193,36 @@ class DLCRunner(BaseRunner):
|
|||||||
else:
|
else:
|
||||||
dlc_job_cmd = 'submit pytorchjob'
|
dlc_job_cmd = 'submit pytorchjob'
|
||||||
worker_cmd = ' --workers 1'
|
worker_cmd = ' --workers 1'
|
||||||
|
|
||||||
|
pre_cmd = self.aliyun_cfg.get('pre_cmd')
|
||||||
|
if pre_cmd is not None:
|
||||||
|
shell_cmd = pre_cmd + '; ' + shell_cmd
|
||||||
|
|
||||||
tmpl = (
|
tmpl = (
|
||||||
f'dlc {dlc_job_cmd}'
|
f'dlc {dlc_job_cmd}'
|
||||||
f" --command '{shell_cmd}'"
|
f''' --command '{shell_cmd}' '''
|
||||||
f' --name {task_name[:512]}'
|
f' --name {task_name[:512]}'
|
||||||
f'{config_path}'
|
f'{config_path}'
|
||||||
f" --workspace_id {self.aliyun_cfg['workspace_id']}"
|
f''' --workspace_id {self.aliyun_cfg['workspace_id']}'''
|
||||||
f" --resource_id={self.aliyun_cfg['resource_id']}"
|
f''' --resource_id={self.aliyun_cfg['resource_id']}'''
|
||||||
f' --priority {task_priority}'
|
f' --priority {task_priority}'
|
||||||
f'{worker_cmd}'
|
f'{worker_cmd}'
|
||||||
f' --worker_cpu {max(num_gpus * 8, worker_cpu)}'
|
f' --worker_cpu {max(num_gpus * 8, worker_cpu)}'
|
||||||
f' --worker_gpu {num_gpus}'
|
f' --worker_gpu {num_gpus}'
|
||||||
f' --worker_memory {max(num_gpus * 128, worker_memory)}Gi'
|
f' --worker_memory {max(num_gpus * 128, worker_memory)}Gi'
|
||||||
f" --worker_image {self.aliyun_cfg['worker_image']}"
|
f''' --worker_image {self.aliyun_cfg['worker_image']}'''
|
||||||
f" --data_sources={','.join(self.aliyun_cfg['data_sources'])}")
|
f''' --data_sources={','.join(self.aliyun_cfg['data_sources'])}''' # noqa: E501
|
||||||
|
f''' --enable_priority_preemption={self.preemptible}''')
|
||||||
get_cmd = partial(task.get_command,
|
get_cmd = partial(task.get_command,
|
||||||
cfg_path=param_file,
|
cfg_path=param_file,
|
||||||
template=tmpl)
|
template=tmpl)
|
||||||
cmd = get_cmd()
|
cmd = get_cmd()
|
||||||
|
|
||||||
# Use specified python env instead of sys.executable
|
# Use specified python env instead of sys.executable
|
||||||
if self.aliyun_cfg['python_env_path']:
|
if self.aliyun_cfg['python_env_path']:
|
||||||
cmd = cmd.replace(
|
cmd = cmd.replace(
|
||||||
sys.executable,
|
sys.executable,
|
||||||
f'{self.aliyun_cfg["python_env_path"]}/bin/python',
|
f'''{self.aliyun_cfg['python_env_path']}/bin/python''',
|
||||||
)
|
)
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
logger.debug(f'Running command: {cmd}')
|
logger.debug(f'Running command: {cmd}')
|
||||||
@ -254,7 +266,7 @@ class DLCRunner(BaseRunner):
|
|||||||
pri_time = None
|
pri_time = None
|
||||||
initial_time = datetime.datetime.now()
|
initial_time = datetime.datetime.now()
|
||||||
|
|
||||||
url = f"https://pai.console.aliyun.com/?regionId=cn-wulanchabu&workspaceId={self.aliyun_cfg['workspace_id']}#/dlc/jobs/{job_id}" # noqa: E501
|
url = f'''https://pai.console.aliyun.com/?regionId=cn-wulanchabu&workspaceId={self.aliyun_cfg['workspace_id']}#/dlc/jobs/{job_id}''' # noqa: E501
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
logger.debug('\n' + '*' * 168 + '\n' + url + '\n' + '*' * 168)
|
logger.debug('\n' + '*' * 168 + '\n' + url + '\n' + '*' * 168)
|
||||||
|
|
||||||
|
@ -363,6 +363,7 @@ class DefaultSummarizer:
|
|||||||
f.write(text)
|
f.write(text)
|
||||||
self.logger.info(f'write summary to {osp.abspath(output_path)}')
|
self.logger.info(f'write summary to {osp.abspath(output_path)}')
|
||||||
|
|
||||||
|
table = [[col.replace(',', ' ') if isinstance(col, str) else col for col in row] for row in table]
|
||||||
with open(output_csv_path, 'w', encoding='utf-8') as f:
|
with open(output_csv_path, 'w', encoding='utf-8') as f:
|
||||||
f.write('\n'.join([','.join(row) for row in table]) + '\n')
|
f.write('\n'.join([','.join(row) for row in table]) + '\n')
|
||||||
self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')
|
self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')
|
||||||
|
Loading…
Reference in New Issue
Block a user