[Update] Update DLC runner (#1637)

This commit is contained in:
Linchen Xiao 2024-10-24 21:36:16 +08:00 committed by GitHub
parent fb12c3f98a
commit 22fdea4bf2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -164,20 +164,27 @@ class DLCRunner(BaseRunner):
# set priority to 1 as default # set priority to 1 as default
task_priority = self.aliyun_cfg.get('priority', 1) task_priority = self.aliyun_cfg.get('priority', 1)
# Different dlc versions have different commands
if self.aliyun_cfg.get('dlc_job_cmd') == 'create':
dlc_job_cmd = 'create job --kind PyTorchJob'
worker_cmd = ' --worker_count 1'
else:
dlc_job_cmd = 'submit pytorchjob'
worker_cmd = ' --workers 1'
tmpl = ( tmpl = (
'dlc submit pytorchjob' f'dlc {dlc_job_cmd}'
f" --command '{shell_cmd}'" f" --command '{shell_cmd}'"
f' --name {task_name[:512]}' f' --name {task_name[:512]}'
f" --config {self.aliyun_cfg['dlc_config_path']}" f" --config {self.aliyun_cfg['dlc_config_path']}"
f" --workspace_id {self.aliyun_cfg['workspace_id']}" f" --workspace_id {self.aliyun_cfg['workspace_id']}"
f" --resource_id {self.aliyun_cfg['resource_id']}" f" --resource_id={self.aliyun_cfg['resource_id']}"
f' --priority {task_priority}' f' --priority {task_priority}'
' --workers 1' f'{worker_cmd}'
f' --worker_cpu {max(num_gpus * 8, 12)}' f' --worker_cpu {max(num_gpus * 8, 12)}'
f' --worker_gpu {num_gpus}' f' --worker_gpu {num_gpus}'
f' --worker_memory {max(num_gpus * 128, 192)}Gi' f' --worker_memory {max(num_gpus * 128, 192)}Gi'
f" --worker_image {self.aliyun_cfg['worker_image']}" f" --worker_image {self.aliyun_cfg['worker_image']}"
f" --data_sources {','.join(self.aliyun_cfg['data_sources'])}") f" --data_sources={','.join(self.aliyun_cfg['data_sources'])}")
get_cmd = partial(task.get_command, get_cmd = partial(task.get_command,
cfg_path=param_file, cfg_path=param_file,
template=tmpl) template=tmpl)