[Feature] DLC runner Lark report (#1735)

* [Bump] Bump version to 0.3.7

* DLC lark report update
Linchen Xiao 2024-12-04 18:03:12 +08:00 committed by GitHub
parent e2a290fd46
commit a011be6798
3 changed files with 37 additions and 22 deletions
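In short: the DLC runner now instantiates its own `LarkReporter` and pings the bot when a job fails, the all-finished summary lists each task by name, and `keep_tmp_file` defaults to `True` for both the DLC and Volc runners. A minimal sketch of enabling the report in a config follows; the webhook URL and the `aliyun_cfg` contents are placeholders, not values from this commit:

```python
# Hypothetical config fragment; lark_bot_url is the feature's only switch.
from opencompass.runners import DLCRunner
from opencompass.tasks import OpenICLInferTask

infer = dict(
    runner=dict(
        type=DLCRunner,
        max_num_workers=32,
        aliyun_cfg=dict(python_env_path='/path/to/env'),  # plus credentials
        lark_bot_url='https://open.feishu.cn/open-apis/bot/v2/hook/<token>',
        task=dict(type=OpenICLInferTask),
    ),
)
```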

opencompass/runners/base.py

@@ -77,7 +77,8 @@ class BaseRunner:
             else:
                 content = f'{getpass.getuser()}\'s '
             content += f'{self.task_cfg.type} tasks finished. '
-            content += f'{num_succeeded} tasks succeeded.'
+            content += f'{num_succeeded} tasks succeeded.\n'
+            content += '\n'.join([task for task, _ in status])
             self.lark_reporter.post(title='Great news: all tasks '
                                     'finished!',
                                     content=content)
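The change above makes the all-finished Lark message enumerate the tasks rather than only counting them. A standalone sketch of the string it builds, assuming `status` is the list of `(task_name, returncode)` tuples the launch step returns:

```python
import getpass

# Stand-in data; in the runner, `status` comes back from launch().
status = [('task_a', 0), ('task_b', 0)]
num_succeeded = sum(1 for _, code in status if code == 0)

content = f"{getpass.getuser()}'s "
content += 'OpenICLInferTask tasks finished. '
content += f'{num_succeeded} tasks succeeded.\n'
content += '\n'.join(task for task, _ in status)
print(content)
# alice's OpenICLInferTask tasks finished. 2 tasks succeeded.
# task_a
# task_b
```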

opencompass/runners/dlc.py

@@ -15,7 +15,7 @@ from mmengine.config import ConfigDict
 from mmengine.utils import track_parallel_progress

 from opencompass.registry import RUNNERS, TASKS
-from opencompass.utils import get_logger
+from opencompass.utils import LarkReporter, get_logger

 from .base import BaseRunner
@@ -35,15 +35,17 @@ class DLCRunner(BaseRunner):
         lark_bot_url (str): Lark bot url. Default: None.
     """

-    def __init__(self,
-                 task: ConfigDict,
-                 aliyun_cfg: ConfigDict,
-                 max_num_workers: int = 32,
-                 eval_with_gpu: list = ['plugin_eval'],
-                 retry: int = 2,
-                 debug: bool = False,
-                 lark_bot_url: str = None,
-                 keep_tmp_file: bool = False):
+    def __init__(
+        self,
+        task: ConfigDict,
+        aliyun_cfg: ConfigDict,
+        max_num_workers: int = 32,
+        eval_with_gpu: list = ['plugin_eval'],
+        retry: int = 2,
+        debug: bool = False,
+        lark_bot_url: str = None,
+        keep_tmp_file: bool = True,
+    ):
         super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
         self.aliyun_cfg = aliyun_cfg
         self.max_num_workers = max_num_workers
@@ -51,6 +53,10 @@ class DLCRunner(BaseRunner):
         self.eval_with_gpu = eval_with_gpu
         self.keep_tmp_file = keep_tmp_file
+        if lark_bot_url:
+            self.lark_reporter = LarkReporter(lark_bot_url)
+        else:
+            self.lark_reporter = None
         logger = get_logger()
         logger.warning(
             'To ensure the integrity of the log results, the log displayed '
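`LarkReporter` comes from `opencompass.utils`; for readers without the source open, a bare-bones stand-in that posts plain text to a Lark/Feishu custom-bot webhook could look like the sketch below. The real class is richer, so treat this purely as an illustration:

```python
import json

import requests

class MiniLarkReporter:
    """Minimal stand-in for opencompass.utils.LarkReporter (sketch only)."""

    def __init__(self, url: str):
        self.url = url

    def post(self, content: str, title: str = None):
        # Lark custom bots accept a JSON body with a msg_type field;
        # 'text' is the simplest message type they support.
        text = content if title is None else f'{title}\n{content}'
        payload = {'msg_type': 'text', 'content': {'text': text}}
        requests.post(self.url,
                      data=json.dumps(payload),
                      headers={'Content-Type': 'application/json'})
```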
@@ -68,10 +74,12 @@ class DLCRunner(BaseRunner):
         """
         if not self.debug:
-            status = track_parallel_progress(self._launch,
-                                             tasks,
-                                             nproc=self.max_num_workers,
-                                             keep_order=False)
+            status = track_parallel_progress(
+                self._launch,
+                tasks,
+                nproc=self.max_num_workers,
+                keep_order=False,
+            )
         else:
             status = [self._launch(task, random_sleep=False) for task in tasks]
         return status
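`track_parallel_progress` is mmengine's progress-bar wrapper over a process pool: it maps `self._launch` across the task configs with `nproc` workers, and `keep_order=False` returns results in completion order. A toy usage with a trivial worker:

```python
from mmengine.utils import track_parallel_progress

def square(x):
    return x * x

if __name__ == '__main__':
    # Results may arrive out of order because keep_order=False.
    results = track_parallel_progress(square,
                                      list(range(8)),
                                      nproc=4,
                                      keep_order=False)
    print(sorted(results))
```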
@@ -92,7 +100,7 @@ class DLCRunner(BaseRunner):
             tuple[str, int]: Task name and exit code.
         """
         if random_sleep is None:
-            random_sleep = (self.max_num_workers > 32)
+            random_sleep = self.max_num_workers > 32

         task = TASKS.build(dict(cfg=cfg, type=self.task_cfg['type']))
         num_gpus = task.num_gpus
@@ -109,6 +117,7 @@ class DLCRunner(BaseRunner):
             mmengine.mkdir_or_exist('tmp/')
             # Using uuid to avoid filename conflict
             import uuid
+
             uuid_str = str(uuid.uuid4())
             param_file = f'tmp/{uuid_str}_params.py'
             pwd = os.getcwd()
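For context on the `tmp/` handling: each launch dumps its task config to a uuid-named param file so concurrent runners never collide, and with `keep_tmp_file` now defaulting to `True` those files are left in place after the job. A condensed sketch of the pattern; `write_param_file` is a hypothetical helper, not a function in this diff:

```python
import uuid

import mmengine
from mmengine.config import Config

def write_param_file(cfg: Config) -> str:
    """Dump a task config to a collision-free temp file under tmp/."""
    mmengine.mkdir_or_exist('tmp/')
    # uuid4 keeps concurrent launches from clobbering each other's files.
    param_file = f'tmp/{uuid.uuid4()}_params.py'
    cfg.dump(param_file)
    return param_file
```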
@@ -201,7 +210,8 @@ class DLCRunner(BaseRunner):
             if self.aliyun_cfg['python_env_path']:
                 cmd = cmd.replace(
                     sys.executable,
-                    f'{self.aliyun_cfg["python_env_path"]}/bin/python')
+                    f'{self.aliyun_cfg["python_env_path"]}/bin/python',
+                )

             logger = get_logger()
             logger.debug(f'Running command: {cmd}')
@@ -259,10 +269,9 @@ class DLCRunner(BaseRunner):
                 try:
                     raw_job_info = subprocess.getoutput(
                         f'dlc get job {job_id}{config_path}')
-                    if raw_job_info.startswith(
-                            '/bin/bash') or raw_job_info.startswith(
-                                '[OK]') or raw_job_info.startswith(
-                                    '[FAILED]'):
+                    if (raw_job_info.startswith('/bin/bash')
+                            or raw_job_info.startswith('[OK]')
+                            or raw_job_info.startswith('[FAILED]')):
                         raw_job_info = raw_job_info[raw_job_info.
                                                     index('\n') + 1:]
                     job_info = json.loads(raw_job_info)
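The reshaped condition strips a one-line shell banner (`/bin/bash ...`, `[OK]`, `[FAILED]`) that `dlc get job` sometimes prints before its JSON payload. The same cleanup as a standalone function, assuming the banner never spans more than one line:

```python
import json

def parse_dlc_job_info(raw: str) -> dict:
    """Drop a one-line banner, if present, then parse the JSON body."""
    if raw.startswith(('/bin/bash', '[OK]', '[FAILED]')):
        raw = raw[raw.index('\n') + 1:]
    return json.loads(raw)

print(parse_dlc_job_info('[OK]\n{"Status": "Succeeded"}'))
# {'Status': 'Succeeded'}
```

Passing a tuple to `str.startswith` folds the three checks into a single call, which is what the refactored condition approximates.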
@@ -325,6 +334,11 @@ class DLCRunner(BaseRunner):
             else:
                 pass

+        # Lark Report when failed
+        if return_code == -1:
+            content = f'DLC job failed. Task name: {task_name}'
+            self.lark_reporter.post(title='DLC job failed', content=content)
+
         return task_name, return_code

     def _job_failed(self, return_code: int, output_paths: List[str]) -> bool:
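One caveat about the new failure report: `self.lark_reporter` is `None` when no `lark_bot_url` was configured, so the `post` call assumes the reporter exists. A defensive variant (a suggestion, not part of this commit) would gate on it:

```python
# Suggested guard, not in this commit: skip the report when no bot is set.
if return_code == -1 and self.lark_reporter is not None:
    content = f'DLC job failed. Task name: {task_name}'
    self.lark_reporter.post(title='DLC job failed', content=content)
```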

opencompass/runners/volc.py

@@ -48,7 +48,7 @@ class VOLCRunner(BaseRunner):
                  retry: int = 2,
                  debug: bool = False,
                  lark_bot_url: str = None,
-                 keep_tmp_file: bool = False):
+                 keep_tmp_file: bool = True):
         super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
         self.volcano_cfg = volcano_cfg
         self.max_num_workers = max_num_workers
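The same default flip lands in `VOLCRunner`, so both cloud runners now retain their generated `tmp/*_params.py` files. If that accumulation is unwanted, a periodic cleanup along these lines works; the glob pattern is assumed from the param-file naming above:

```python
import glob
import os

# Remove leftover runner param files from the working directory.
for path in glob.glob('tmp/*_params.py'):
    os.remove(path)
```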