import datetime import json import os import os.path as osp import random import re import subprocess import sys import time from functools import partial from typing import Any, Dict, List, Optional, Tuple import mmengine from mmengine.config import ConfigDict from mmengine.utils import track_parallel_progress from opencompass.registry import RUNNERS, TASKS from opencompass.utils import LarkReporter, get_logger from .base import BaseRunner @RUNNERS.register_module() class DLCRunner(BaseRunner): """Distributed runner based on Alibaba Cloud Deep Learning Cluster (DLC). It will launch multiple tasks in parallel with 'dlc' command. Please install and configure DLC first before using this runner. Args: task (ConfigDict): Task type config. aliyun_cfg (ConfigDict): Alibaba Cloud config. max_num_workers (int): Max number of workers. Default: 32. retry (int): Number of retries when job failed. Default: 2. debug (bool): Whether to run in debug mode. Default: False. lark_bot_url (str): Lark bot url. Default: None. """ def __init__( self, task: ConfigDict, aliyun_cfg: ConfigDict, max_num_workers: int = 32, eval_with_gpu: list = ['plugin_eval'], retry: int = 2, debug: bool = False, lark_bot_url: str = None, keep_tmp_file: bool = True, preemptible: bool = False, ): super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) self.aliyun_cfg = aliyun_cfg self.max_num_workers = max_num_workers self.retry = retry self.eval_with_gpu = eval_with_gpu self.keep_tmp_file = keep_tmp_file self.preemptible = preemptible if lark_bot_url: self.lark_reporter = LarkReporter(lark_bot_url) else: self.lark_reporter = None logger = get_logger() logger.warning( 'To ensure the integrity of the log results, the log displayed ' f'by {self.__class__.__name__} has a 10-second delay.') def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]: """Launch multiple tasks. Args: tasks (list[dict]): A list of task configs, usually generated by Partitioner. Returns: list[tuple[str, int]]: A list of (task name, exit code). """ if not self.debug: status = track_parallel_progress( self._launch, tasks, nproc=self.max_num_workers, keep_order=False, ) else: status = [self._launch(task, random_sleep=False) for task in tasks] return status def _launch(self, cfg: ConfigDict, random_sleep: Optional[bool] = None): """Launch a single task. Args: cfg (ConfigDict): Task config. random_sleep (bool): Whether to sleep for a random time before running the command. When Aliyun has many tasks to schedule, its stability decreases. Therefore, when we need to submit a large number of tasks at once, we adopt the "random_sleep" strategy. Tasks that would have been submitted all at once are now evenly spread out over a 10-second period. Default: None. Returns: tuple[str, int]: Task name and exit code. """ if random_sleep is None: random_sleep = self.max_num_workers > 32 task = TASKS.build(dict(cfg=cfg, type=self.task_cfg['type'])) num_gpus = task.num_gpus task_name = task.name is_eval_task = 'OpenICLEval' in task_name if is_eval_task and num_gpus == 0: for check_name in self.eval_with_gpu: if check_name in task_name: num_gpus = 1 break # Dump task config to file mmengine.mkdir_or_exist('tmp/') # Using uuid to avoid filename conflict import uuid uuid_str = str(uuid.uuid4()) param_file = f'tmp/{uuid_str}_params.py' pwd = os.getcwd() try: cfg.dump(param_file) if self.aliyun_cfg.get('bashrc_path') is not None: # using user's conda env bashrc_path = self.aliyun_cfg['bashrc_path'] assert osp.exists(bashrc_path) assert self.aliyun_cfg.get('conda_env_name') is not None conda_env_name = self.aliyun_cfg['conda_env_name'] shell_cmd = (f'source {bashrc_path}; ' f'conda activate {conda_env_name}; ') shell_cmd += f'export PYTHONPATH={pwd}:$PYTHONPATH; ' elif self.aliyun_cfg.get('python_env_path') is not None: # using public conda env # users can also set `python_env_path` to their # own env python path shell_cmd = ( f'''export PATH={self.aliyun_cfg['python_env_path']}/bin:$PATH; ''' # noqa: E501 f'export PYTHONPATH={pwd}:$PYTHONPATH; ') else: # using system python shell_cmd = '' huggingface_cache = self.aliyun_cfg.get('huggingface_cache') if huggingface_cache is not None: # HUGGINGFACE_HUB_CACHE is a Legacy env variable, here we set # `HF_HUB_CACHE` and `HUGGINGFACE_HUB_CACHE` for bc shell_cmd += f'export HF_HUB_CACHE={huggingface_cache}; ' shell_cmd += f'export HUGGINGFACE_HUB_CACHE={huggingface_cache}; ' # noqa: E501 torch_cache = self.aliyun_cfg.get('torch_cache') if torch_cache is not None: shell_cmd += f'export TORCH_HOME={torch_cache}; ' hf_offline = self.aliyun_cfg.get('hf_offline', True) if hf_offline: shell_cmd += 'export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_EVALUATE_OFFLINE=1; export HF_HUB_OFFLINE=1; ' # noqa: E501 http_proxy = self.aliyun_cfg.get('http_proxy') if http_proxy is not None: shell_cmd += f'export http_proxy={http_proxy}; export https_proxy={http_proxy}; ' # noqa: E501 shell_cmd += f'export HTTP_PROXY={http_proxy}; export HTTPS_PROXY={http_proxy}; ' # noqa: E501 hf_endpoint = self.aliyun_cfg.get('hf_endpoint') if hf_endpoint is not None: shell_cmd += f'export HF_ENDPOINT={hf_endpoint}; ' extra_envs = self.aliyun_cfg.get('extra_envs') if extra_envs is not None: for extra_env in extra_envs: shell_cmd += f'export {extra_env}; ' shell_cmd += f'cd {pwd}; ' shell_cmd += 'umask 0000; ' shell_cmd += '{task_cmd}' # set priority to 1 as default task_priority = self.aliyun_cfg.get('priority', 1) worker_cpu = self.aliyun_cfg.get('worker_cpu', 12) worker_memory = self.aliyun_cfg.get('worker_memory', 192) config_path = ( f''' --config {self.aliyun_cfg['dlc_config_path']}''' if 'dlc_config_path' in self.aliyun_cfg else '') # Different dlc versions has different commands if self.aliyun_cfg.get('dlc_job_cmd') == 'create': dlc_job_cmd = 'create job --kind PyTorchJob' worker_cmd = ' --worker_count 1' else: dlc_job_cmd = 'submit pytorchjob' worker_cmd = ' --workers 1' pre_cmd = self.aliyun_cfg.get('pre_cmd') if pre_cmd is not None: shell_cmd = pre_cmd + '; ' + shell_cmd tmpl = ( f'dlc {dlc_job_cmd}' f''' --command '{shell_cmd}' ''' f' --name {task_name[:512]}' f'{config_path}' f''' --workspace_id {self.aliyun_cfg['workspace_id']}''' f''' --resource_id={self.aliyun_cfg['resource_id']}''' f' --priority {task_priority}' f'{worker_cmd}' f' --worker_cpu {max(num_gpus * 8, worker_cpu)}' f' --worker_gpu {num_gpus}' f' --worker_memory {max(num_gpus * 128, worker_memory)}Gi' f''' --worker_image {self.aliyun_cfg['worker_image']}''' f''' --data_sources={','.join(self.aliyun_cfg['data_sources'])}''' # noqa: E501 f''' --enable_priority_preemption={self.preemptible}''') get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl) cmd = get_cmd() # Use specified python env instead of sys.executable if self.aliyun_cfg['python_env_path']: cmd = cmd.replace( sys.executable, f'''{self.aliyun_cfg['python_env_path']}/bin/python''', ) logger = get_logger() logger.debug(f'Running command: {cmd}') # Run command with retry if self.debug: stdout = sys.stdout else: out_path = task.get_log_path(file_extension='out') mmengine.mkdir_or_exist(osp.split(out_path)[0]) stdout = open(out_path, 'w', encoding='utf-8') if random_sleep: time.sleep(random.randint(0, 10)) def _run_within_retry(): num_retry_to_start = 5 index_to_start = 0 while index_to_start < num_retry_to_start: index_to_start += 1 try: output = subprocess.getoutput(cmd) except BlockingIOError: output = '' match = re.search(r'\|\s+(dlc[0-9a-z]+)\s+\|', output) if match is None: stdout.write('Failed to get job id from output:') stdout.write(output) if index_to_start < num_retry_to_start: stdout.write(f'Retry #{index_to_start} starting') time.sleep(2) continue else: job_id = match.group(1) stdout.write(output) break else: raise RuntimeError(f'Cannot get job id from {output}') pod_create_time = None pri_time = None initial_time = datetime.datetime.now() url = f'''https://pai.console.aliyun.com/?regionId=cn-wulanchabu&workspaceId={self.aliyun_cfg['workspace_id']}#/dlc/jobs/{job_id}''' # noqa: E501 logger = get_logger() logger.debug('\n' + '*' * 168 + '\n' + url + '\n' + '*' * 168) while True: # 1. Avoid to request dlc too frequently. # 2. DLC job may not be ready immediately after creation. dlc_sleep_time = self.aliyun_cfg.get('dlc_sleep_time', 10) time.sleep(dlc_sleep_time) num_retry = 60 for retry_index in range(num_retry): time.sleep(2) try: raw_job_info = subprocess.getoutput( f'dlc get job {job_id}{config_path}') if (raw_job_info.startswith('/bin/bash') or raw_job_info.startswith('[OK]') or raw_job_info.startswith('[FAILED]')): raw_job_info = raw_job_info[raw_job_info. index('\n') + 1:] job_info = json.loads(raw_job_info) break except: # noqa: E722 if retry_index > num_retry // 3: logger.warning( f'Failed to get job info for {job_id}, ' 'retrying...') else: raise RuntimeError( f'Failed to get job info for {job_id}') status = job_info['Status'] if status == 'Failed' or status == 'Stopped': return -1 elif status == 'Succeeded': return 0 elif status != 'Running': continue # The pod time could be different from the real time. # Therefore we need to extract the pod start time from # the `job_info` and calculate the `start_time` and # `end_time` in pod. if pod_create_time is None: pod_create_time = job_info['GmtCreateTime'] pri_time = pod_create_time pod_create_time = datetime.datetime.strptime( pod_create_time, '%Y-%m-%dT%H:%M:%SZ') elasped_time = datetime.datetime.now() - initial_time cur_time = (pod_create_time + elasped_time).strftime('%Y-%m-%dT%H:%M:%SZ') logs_cmd = ('dlc logs' f' {job_id} {job_id}-master-0' f'{config_path}' f' --start_time {pri_time}' f' --end_time {cur_time}') try: log_output = subprocess.getoutput(logs_cmd) except BlockingIOError: log_output = '[WARN] No logs found for the pod' if '[WARN] No logs found for the pod' not in log_output: pri_time = cur_time stdout.write(log_output) stdout.flush() return_code = _run_within_retry() retry = self.retry output_paths = task.get_output_paths() while self._job_failed(return_code, output_paths) and retry > 0: retry -= 1 cmd = get_cmd() return_code = _run_within_retry() finally: # Clean up if not self.keep_tmp_file: os.remove(param_file) else: pass # Lark Report when failed if return_code == -1 and self.lark_reporter is not None: content = f'DLC job failed. Task name: {task_name}' self.lark_reporter.post(title='DLC job failed', content=content) return task_name, return_code def _job_failed(self, return_code: int, output_paths: List[str]) -> bool: return return_code != 0 or not all( osp.exists(output_path) for output_path in output_paths)