diff --git a/opencompass/runners/base.py b/opencompass/runners/base.py index 44bb2ac9..3cd2820f 100644 --- a/opencompass/runners/base.py +++ b/opencompass/runners/base.py @@ -77,7 +77,8 @@ class BaseRunner: else: content = f'{getpass.getuser()}\'s ' content += f'{self.task_cfg.type} tasks finished. ' - content += f'{num_succeeded} tasks succeeded.' + content += f'{num_succeeded} tasks succeeded.\n' + content += '\n'.join([task for task, _ in status]) self.lark_reporter.post(title='Great news: all tasks ' 'finished!', content=content) diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 45f7ec82..22a189d5 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -15,7 +15,7 @@ from mmengine.config import ConfigDict from mmengine.utils import track_parallel_progress from opencompass.registry import RUNNERS, TASKS -from opencompass.utils import get_logger +from opencompass.utils import LarkReporter, get_logger from .base import BaseRunner @@ -35,15 +35,17 @@ class DLCRunner(BaseRunner): lark_bot_url (str): Lark bot url. Default: None. """ - def __init__(self, - task: ConfigDict, - aliyun_cfg: ConfigDict, - max_num_workers: int = 32, - eval_with_gpu: list = ['plugin_eval'], - retry: int = 2, - debug: bool = False, - lark_bot_url: str = None, - keep_tmp_file: bool = False): + def __init__( + self, + task: ConfigDict, + aliyun_cfg: ConfigDict, + max_num_workers: int = 32, + eval_with_gpu: list = ['plugin_eval'], + retry: int = 2, + debug: bool = False, + lark_bot_url: str = None, + keep_tmp_file: bool = True, + ): super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) self.aliyun_cfg = aliyun_cfg self.max_num_workers = max_num_workers @@ -51,6 +53,10 @@ class DLCRunner(BaseRunner): self.eval_with_gpu = eval_with_gpu self.keep_tmp_file = keep_tmp_file + if lark_bot_url: + self.lark_reporter = LarkReporter(lark_bot_url) + else: + self.lark_reporter = None logger = get_logger() logger.warning( 'To ensure the integrity of the log results, the log displayed ' @@ -68,10 +74,12 @@ class DLCRunner(BaseRunner): """ if not self.debug: - status = track_parallel_progress(self._launch, - tasks, - nproc=self.max_num_workers, - keep_order=False) + status = track_parallel_progress( + self._launch, + tasks, + nproc=self.max_num_workers, + keep_order=False, + ) else: status = [self._launch(task, random_sleep=False) for task in tasks] return status @@ -92,7 +100,7 @@ class DLCRunner(BaseRunner): tuple[str, int]: Task name and exit code. """ if random_sleep is None: - random_sleep = (self.max_num_workers > 32) + random_sleep = self.max_num_workers > 32 task = TASKS.build(dict(cfg=cfg, type=self.task_cfg['type'])) num_gpus = task.num_gpus @@ -109,6 +117,7 @@ class DLCRunner(BaseRunner): mmengine.mkdir_or_exist('tmp/') # Using uuid to avoid filename conflict import uuid + uuid_str = str(uuid.uuid4()) param_file = f'tmp/{uuid_str}_params.py' pwd = os.getcwd() @@ -201,7 +210,8 @@ class DLCRunner(BaseRunner): if self.aliyun_cfg['python_env_path']: cmd = cmd.replace( sys.executable, - f'{self.aliyun_cfg["python_env_path"]}/bin/python') + f'{self.aliyun_cfg["python_env_path"]}/bin/python', + ) logger = get_logger() logger.debug(f'Running command: {cmd}') @@ -259,10 +269,9 @@ class DLCRunner(BaseRunner): try: raw_job_info = subprocess.getoutput( f'dlc get job {job_id}{config_path}') - if raw_job_info.startswith( - '/bin/bash') or raw_job_info.startswith( - '[OK]') or raw_job_info.startswith( - '[FAILED]'): + if (raw_job_info.startswith('/bin/bash') + or raw_job_info.startswith('[OK]') + or raw_job_info.startswith('[FAILED]')): raw_job_info = raw_job_info[raw_job_info. index('\n') + 1:] job_info = json.loads(raw_job_info) @@ -325,6 +334,11 @@ class DLCRunner(BaseRunner): else: pass + # Lark Report when failed + if return_code == -1: + content = f'DLC job failed. Task name: {task_name}' + self.lark_reporter.post(title='DLC job failed', content=content) + return task_name, return_code def _job_failed(self, return_code: int, output_paths: List[str]) -> bool: diff --git a/opencompass/runners/volc.py b/opencompass/runners/volc.py index d48f7a43..b5f38223 100644 --- a/opencompass/runners/volc.py +++ b/opencompass/runners/volc.py @@ -48,7 +48,7 @@ class VOLCRunner(BaseRunner): retry: int = 2, debug: bool = False, lark_bot_url: str = None, - keep_tmp_file: bool = False): + keep_tmp_file: bool = True): super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) self.volcano_cfg = volcano_cfg self.max_num_workers = max_num_workers