[Feature] DLC runner Lark report (#1735)

* [Bump] Bump version to 0.3.7

* DLC lark report update
Linchen Xiao 2024-12-04 18:03:12 +08:00 committed by GitHub
parent e2a290fd46
commit a011be6798
3 changed files with 37 additions and 22 deletions
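In short: the DLC runner now instantiates its own `LarkReporter` and pings the bot when a job fails, the all-finished summary lists each task by name, and `keep_tmp_file` defaults to `True` for both the DLC and Volc runners. A minimal sketch of enabling the report in a config follows; the webhook URL and the `aliyun_cfg` contents are placeholders, not values from this commit:

```python
# Hypothetical config fragment; lark_bot_url is the feature's only switch.
from opencompass.runners import DLCRunner
from opencompass.tasks import OpenICLInferTask

infer = dict(
    runner=dict(
        type=DLCRunner,
        max_num_workers=32,
        aliyun_cfg=dict(python_env_path='/path/to/env'),  # plus credentials
        lark_bot_url='https://open.feishu.cn/open-apis/bot/v2/hook/<token>',
        task=dict(type=OpenICLInferTask),
    ),
)
```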

opencompass/runners/base.py

@@ -77,7 +77,8 @@ class BaseRunner:
             else:
                 content = f'{getpass.getuser()}\'s '
             content += f'{self.task_cfg.type} tasks finished. '
-            content += f'{num_succeeded} tasks succeeded.'
+            content += f'{num_succeeded} tasks succeeded.\n'
+            content += '\n'.join([task for task, _ in status])
             self.lark_reporter.post(title='Great news: all tasks '
                                     'finished!',
                                     content=content)
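The change above makes the all-finished Lark message enumerate the tasks rather than only counting them. A standalone sketch of the string it builds, assuming `status` is the list of `(task_name, returncode)` tuples the launch step returns:

```python
import getpass

# Stand-in data; in the runner, `status` comes back from launch().
status = [('task_a', 0), ('task_b', 0)]
num_succeeded = sum(1 for _, code in status if code == 0)

content = f"{getpass.getuser()}'s "
content += 'OpenICLInferTask tasks finished. '
content += f'{num_succeeded} tasks succeeded.\n'
content += '\n'.join(task for task, _ in status)
print(content)
# alice's OpenICLInferTask tasks finished. 2 tasks succeeded.
# task_a
# task_b
```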

opencompass/runners/dlc.py

@@ -15,7 +15,7 @@ from mmengine.config import ConfigDict
 from mmengine.utils import track_parallel_progress

 from opencompass.registry import RUNNERS, TASKS
-from opencompass.utils import get_logger
+from opencompass.utils import LarkReporter, get_logger

 from .base import BaseRunner
@@ -35,15 +35,17 @@ class DLCRunner(BaseRunner):
         lark_bot_url (str): Lark bot url. Default: None.
     """

-    def __init__(self,
-                 task: ConfigDict,
-                 aliyun_cfg: ConfigDict,
-                 max_num_workers: int = 32,
-                 eval_with_gpu: list = ['plugin_eval'],
-                 retry: int = 2,
-                 debug: bool = False,
-                 lark_bot_url: str = None,
-                 keep_tmp_file: bool = False):
+    def __init__(
+        self,
+        task: ConfigDict,
+        aliyun_cfg: ConfigDict,
+        max_num_workers: int = 32,
+        eval_with_gpu: list = ['plugin_eval'],
+        retry: int = 2,
+        debug: bool = False,
+        lark_bot_url: str = None,
+        keep_tmp_file: bool = True,
+    ):
         super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
         self.aliyun_cfg = aliyun_cfg
         self.max_num_workers = max_num_workers
@@ -51,6 +53,10 @@ class DLCRunner(BaseRunner):
         self.eval_with_gpu = eval_with_gpu
         self.keep_tmp_file = keep_tmp_file
+        if lark_bot_url:
+            self.lark_reporter = LarkReporter(lark_bot_url)
+        else:
+            self.lark_reporter = None
         logger = get_logger()
         logger.warning(
             'To ensure the integrity of the log results, the log displayed '
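`LarkReporter` comes from `opencompass.utils`; for readers without the source open, a bare-bones stand-in that posts plain text to a Lark/Feishu custom-bot webhook could look like the sketch below. The real class is richer, so treat this purely as an illustration:

```python
import json

import requests

class MiniLarkReporter:
    """Minimal stand-in for opencompass.utils.LarkReporter (sketch only)."""

    def __init__(self, url: str):
        self.url = url

    def post(self, content: str, title: str = None):
        # Lark custom bots accept a JSON body with a msg_type field;
        # 'text' is the simplest message type they support.
        text = content if title is None else f'{title}\n{content}'
        payload = {'msg_type': 'text', 'content': {'text': text}}
        requests.post(self.url,
                      data=json.dumps(payload),
                      headers={'Content-Type': 'application/json'})
```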
@@ -68,10 +74,12 @@ class DLCRunner(BaseRunner):
         """
         if not self.debug:
-            status = track_parallel_progress(self._launch,
-                                             tasks,
-                                             nproc=self.max_num_workers,
-                                             keep_order=False)
+            status = track_parallel_progress(
+                self._launch,
+                tasks,
+                nproc=self.max_num_workers,
+                keep_order=False,
+            )
         else:
             status = [self._launch(task, random_sleep=False) for task in tasks]
         return status
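`track_parallel_progress` is mmengine's progress-bar wrapper over a process pool: it maps `self._launch` across the task configs with `nproc` workers, and `keep_order=False` returns results in completion order. A toy usage with a trivial worker:

```python
from mmengine.utils import track_parallel_progress

def square(x):
    return x * x

if __name__ == '__main__':
    # Results may arrive out of order because keep_order=False.
    results = track_parallel_progress(square,
                                      list(range(8)),
                                      nproc=4,
                                      keep_order=False)
    print(sorted(results))
```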
@@ -92,7 +100,7 @@ class DLCRunner(BaseRunner):
             tuple[str, int]: Task name and exit code.
         """
         if random_sleep is None:
-            random_sleep = (self.max_num_workers > 32)
+            random_sleep = self.max_num_workers > 32

         task = TASKS.build(dict(cfg=cfg, type=self.task_cfg['type']))
         num_gpus = task.num_gpus
@@ -109,6 +117,7 @@ class DLCRunner(BaseRunner):
             mmengine.mkdir_or_exist('tmp/')
             # Using uuid to avoid filename conflict
             import uuid
+
             uuid_str = str(uuid.uuid4())
             param_file = f'tmp/{uuid_str}_params.py'
             pwd = os.getcwd()
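For context on the `tmp/` handling: each launch dumps its task config to a uuid-named param file so concurrent runners never collide, and with `keep_tmp_file` now defaulting to `True` those files are left in place after the job. A condensed sketch of the pattern; `write_param_file` is a hypothetical helper, not a function in this diff:

```python
import uuid

import mmengine
from mmengine.config import Config

def write_param_file(cfg: Config) -> str:
    """Dump a task config to a collision-free temp file under tmp/."""
    mmengine.mkdir_or_exist('tmp/')
    # uuid4 keeps concurrent launches from clobbering each other's files.
    param_file = f'tmp/{uuid.uuid4()}_params.py'
    cfg.dump(param_file)
    return param_file
```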
@@ -201,7 +210,8 @@ class DLCRunner(BaseRunner):
             if self.aliyun_cfg['python_env_path']:
                 cmd = cmd.replace(
                     sys.executable,
-                    f'{self.aliyun_cfg["python_env_path"]}/bin/python')
+                    f'{self.aliyun_cfg["python_env_path"]}/bin/python',
+                )

             logger = get_logger()
             logger.debug(f'Running command: {cmd}')
@@ -259,10 +269,9 @@ class DLCRunner(BaseRunner):
                 try:
                     raw_job_info = subprocess.getoutput(
                         f'dlc get job {job_id}{config_path}')
-                    if raw_job_info.startswith(
-                            '/bin/bash') or raw_job_info.startswith(
-                                '[OK]') or raw_job_info.startswith(
-                                    '[FAILED]'):
+                    if (raw_job_info.startswith('/bin/bash')
+                            or raw_job_info.startswith('[OK]')
+                            or raw_job_info.startswith('[FAILED]')):
                         raw_job_info = raw_job_info[raw_job_info.
                                                     index('\n') + 1:]
                     job_info = json.loads(raw_job_info)
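The reshaped condition strips a one-line shell banner (`/bin/bash ...`, `[OK]`, `[FAILED]`) that `dlc get job` sometimes prints before its JSON payload. The same cleanup as a standalone function, assuming the banner never spans more than one line:

```python
import json

def parse_dlc_job_info(raw: str) -> dict:
    """Drop a one-line banner, if present, then parse the JSON body."""
    if raw.startswith(('/bin/bash', '[OK]', '[FAILED]')):
        raw = raw[raw.index('\n') + 1:]
    return json.loads(raw)

print(parse_dlc_job_info('[OK]\n{"Status": "Succeeded"}'))
# {'Status': 'Succeeded'}
```

Passing a tuple to `str.startswith` folds the three checks into a single call, which is what the refactored condition approximates.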
@@ -325,6 +334,11 @@ class DLCRunner(BaseRunner):
             else:
                 pass

+        # Lark Report when failed
+        if return_code == -1:
+            content = f'DLC job failed. Task name: {task_name}'
+            self.lark_reporter.post(title='DLC job failed', content=content)
+
         return task_name, return_code

     def _job_failed(self, return_code: int, output_paths: List[str]) -> bool:
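One caveat about the new failure report: `self.lark_reporter` is `None` when no `lark_bot_url` was configured, so the `post` call assumes the reporter exists. A defensive variant (a suggestion, not part of this commit) would gate on it:

```python
# Suggested guard, not in this commit: skip the report when no bot is set.
if return_code == -1 and self.lark_reporter is not None:
    content = f'DLC job failed. Task name: {task_name}'
    self.lark_reporter.post(title='DLC job failed', content=content)
```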

opencompass/runners/volc.py

@@ -48,7 +48,7 @@ class VOLCRunner(BaseRunner):
                  retry: int = 2,
                  debug: bool = False,
                  lark_bot_url: str = None,
-                 keep_tmp_file: bool = False):
+                 keep_tmp_file: bool = True):
         super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
         self.volcano_cfg = volcano_cfg
         self.max_num_workers = max_num_workers
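The same default flip lands in `VOLCRunner`, so both cloud runners now retain their generated `tmp/*_params.py` files. If that accumulation is unwanted, a periodic cleanup along these lines works; the glob pattern is assumed from the param-file naming above:

```python
import glob
import os

# Remove leftover runner param files from the working directory.
for path in glob.glob('tmp/*_params.py'):
    os.remove(path)
```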