mirror of https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feature] DLC runner Lark report (#1735)
* [Bump] Bump version to 0.3.7
* DLC lark report update
This commit is contained in:
parent e2a290fd46
commit a011be6798
@@ -77,7 +77,8 @@ class BaseRunner:
             else:
                 content = f'{getpass.getuser()}\'s '
                 content += f'{self.task_cfg.type} tasks finished. '
-                content += f'{num_succeeded} tasks succeeded.'
+                content += f'{num_succeeded} tasks succeeded.\n'
+                content += '\n'.join([task for task, _ in status])
                 self.lark_reporter.post(title='Great news: all tasks '
                                         'finished!',
                                         content=content)
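The change above makes the all-finished Lark summary list every task name after the success count. A minimal standalone sketch of how that body is assembled, using a fabricated status list of (task name, exit code) tuples; the task names and type string are placeholders, not from this commit:

```python
import getpass

# Fabricated run result: (task name, exit code) per task.
status = [('OpenICLInferTask[model-a/dataset-1]', 0),
          ('OpenICLInferTask[model-a/dataset-2]', 0)]
num_succeeded = sum(1 for _, code in status if code == 0)

# Mirrors the summary body built in BaseRunner above.
content = f"{getpass.getuser()}'s "
content += 'OpenICLInferTask tasks finished. '
content += f'{num_succeeded} tasks succeeded.\n'
content += '\n'.join(task for task, _ in status)
print(content)
```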
@@ -15,7 +15,7 @@ from mmengine.config import ConfigDict
 from mmengine.utils import track_parallel_progress

 from opencompass.registry import RUNNERS, TASKS
-from opencompass.utils import get_logger
+from opencompass.utils import LarkReporter, get_logger

 from .base import BaseRunner

@@ -35,15 +35,17 @@ class DLCRunner(BaseRunner):
         lark_bot_url (str): Lark bot url. Default: None.
     """

-    def __init__(self,
-                 task: ConfigDict,
-                 aliyun_cfg: ConfigDict,
-                 max_num_workers: int = 32,
-                 eval_with_gpu: list = ['plugin_eval'],
-                 retry: int = 2,
-                 debug: bool = False,
-                 lark_bot_url: str = None,
-                 keep_tmp_file: bool = False):
+    def __init__(
+        self,
+        task: ConfigDict,
+        aliyun_cfg: ConfigDict,
+        max_num_workers: int = 32,
+        eval_with_gpu: list = ['plugin_eval'],
+        retry: int = 2,
+        debug: bool = False,
+        lark_bot_url: str = None,
+        keep_tmp_file: bool = True,
+    ):
         super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
         self.aliyun_cfg = aliyun_cfg
         self.max_num_workers = max_num_workers
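For context, DLCRunner is normally built from a config rather than instantiated by hand. A hypothetical config fragment showing how lark_bot_url and the new keep_tmp_file default might be wired in; all field values below are placeholders, only the parameter names come from the signature above:

```python
# Hypothetical runner config fragment; every value is a placeholder.
infer = dict(
    runner=dict(
        type='DLCRunner',
        aliyun_cfg=dict(python_env_path='/path/to/conda/env'),
        max_num_workers=32,
        retry=2,
        debug=False,
        lark_bot_url='https://open.feishu.cn/open-apis/bot/v2/hook/<token>',
        keep_tmp_file=True,
    ),
)
```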
@@ -51,6 +53,10 @@ class DLCRunner(BaseRunner):

         self.eval_with_gpu = eval_with_gpu
         self.keep_tmp_file = keep_tmp_file
+        if lark_bot_url:
+            self.lark_reporter = LarkReporter(lark_bot_url)
+        else:
+            self.lark_reporter = None
         logger = get_logger()
         logger.warning(
             'To ensure the integrity of the log results, the log displayed '
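LarkReporter is an existing opencompass helper and its internals are not part of this diff; only its post(title=..., content=...) call appears. As a rough sketch of what a Lark/Feishu custom-bot webhook post generally looks like (an assumption about the transport, not a description of LarkReporter's implementation):

```python
import requests


def post_to_lark(webhook_url: str, title: str, content: str) -> None:
    """Send a plain-text message to a Lark/Feishu custom-bot webhook."""
    payload = {'msg_type': 'text', 'content': {'text': f'{title}\n{content}'}}
    requests.post(webhook_url, json=payload, timeout=10)
```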
@@ -68,10 +74,12 @@ class DLCRunner(BaseRunner):
         """

         if not self.debug:
-            status = track_parallel_progress(self._launch,
-                                             tasks,
-                                             nproc=self.max_num_workers,
-                                             keep_order=False)
+            status = track_parallel_progress(
+                self._launch,
+                tasks,
+                nproc=self.max_num_workers,
+                keep_order=False,
+            )
         else:
             status = [self._launch(task, random_sleep=False) for task in tasks]
         return status
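The reflowed call above is cosmetic; behaviour is unchanged. mmengine's track_parallel_progress maps a function over a task list in a worker pool and shows a progress bar. A small self-contained usage sketch; the worker function and task list are made up:

```python
from mmengine.utils import track_parallel_progress


def fake_launch(task_id: int):
    # Stand-in for DLCRunner._launch: pretend every task exits cleanly.
    return f'task-{task_id}', 0


if __name__ == '__main__':
    # keep_order=False yields results as workers finish, as the runner does.
    status = track_parallel_progress(fake_launch, list(range(8)), nproc=4,
                                     keep_order=False)
    print(status)
```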
@@ -92,7 +100,7 @@ class DLCRunner(BaseRunner):
             tuple[str, int]: Task name and exit code.
         """
         if random_sleep is None:
-            random_sleep = (self.max_num_workers > 32)
+            random_sleep = self.max_num_workers > 32

         task = TASKS.build(dict(cfg=cfg, type=self.task_cfg['type']))
         num_gpus = task.num_gpus
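Dropping the parentheses does not change behaviour: random_sleep still defaults to True only when more than 32 workers are launching jobs at once, so submissions get staggered. A tiny illustration of that intent; the sleep range is an assumption for the sketch, not taken from this diff:

```python
import random
import time

max_num_workers = 64
random_sleep = max_num_workers > 32  # same expression as in the diff

if random_sleep:
    # Hypothetical stagger so many workers do not hit the DLC API at once.
    time.sleep(random.randint(0, 10))
```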
@@ -109,6 +117,7 @@ class DLCRunner(BaseRunner):
         mmengine.mkdir_or_exist('tmp/')
         # Using uuid to avoid filename conflict
         import uuid
+
         uuid_str = str(uuid.uuid4())
         param_file = f'tmp/{uuid_str}_params.py'
         pwd = os.getcwd()
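The only change here is a blank line after the local import, but the surrounding code is worth a note: each task's parameters go into a uniquely named temp file so concurrent launches cannot clobber each other. A minimal standalone sketch of that pattern; the file contents are a placeholder:

```python
import os
import uuid

os.makedirs('tmp/', exist_ok=True)  # mmengine.mkdir_or_exist does the same
uuid_str = str(uuid.uuid4())
param_file = f'tmp/{uuid_str}_params.py'

# Placeholder payload; DLCRunner dumps the task config here instead.
with open(param_file, 'w') as f:
    f.write('# task parameters would be written here\n')
print(param_file)
```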
@@ -201,7 +210,8 @@ class DLCRunner(BaseRunner):
         if self.aliyun_cfg['python_env_path']:
             cmd = cmd.replace(
                 sys.executable,
-                f'{self.aliyun_cfg["python_env_path"]}/bin/python')
+                f'{self.aliyun_cfg["python_env_path"]}/bin/python',
+            )
         logger = get_logger()
         logger.debug(f'Running command: {cmd}')

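Only a trailing comma and closing parenthesis move here; the logic still swaps the driver's interpreter for the one configured under aliyun_cfg['python_env_path']. A small sketch of that substitution; the paths and command string are placeholders:

```python
import sys

aliyun_cfg = {'python_env_path': '/opt/conda/envs/opencompass'}  # placeholder
cmd = f'{sys.executable} run_task.py tmp/params.py'  # placeholder command

if aliyun_cfg['python_env_path']:
    cmd = cmd.replace(
        sys.executable,
        f'{aliyun_cfg["python_env_path"]}/bin/python',
    )
print(cmd)
```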
@@ -259,10 +269,9 @@ class DLCRunner(BaseRunner):
            try:
                raw_job_info = subprocess.getoutput(
                    f'dlc get job {job_id}{config_path}')
-                if raw_job_info.startswith(
-                        '/bin/bash') or raw_job_info.startswith(
-                            '[OK]') or raw_job_info.startswith(
-                                '[FAILED]'):
+                if (raw_job_info.startswith('/bin/bash')
+                        or raw_job_info.startswith('[OK]')
+                        or raw_job_info.startswith('[FAILED]')):
                    raw_job_info = raw_job_info[raw_job_info.
                                                index('\n') + 1:]
                job_info = json.loads(raw_job_info)
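The rewritten condition keeps the same behaviour: if the dlc get job output starts with shell noise ('/bin/bash', '[OK]' or '[FAILED]'), drop the first line before parsing the JSON body. A self-contained sketch with a fabricated output string; the JSON fields are made up for illustration:

```python
import json

# Fabricated `dlc get job` output with a leading status line.
raw_job_info = '[OK] queried job\n{"Status": "Succeeded", "JobId": "placeholder"}'

if (raw_job_info.startswith('/bin/bash')
        or raw_job_info.startswith('[OK]')
        or raw_job_info.startswith('[FAILED]')):
    raw_job_info = raw_job_info[raw_job_info.index('\n') + 1:]

job_info = json.loads(raw_job_info)
print(job_info['Status'])
```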
@@ -325,6 +334,11 @@ class DLCRunner(BaseRunner):
            else:
                pass

+        # Lark Report when failed
+        if return_code == -1:
+            content = f'DLC job failed. Task name: {task_name}'
+            self.lark_reporter.post(title='DLC job failed', content=content)
+
        return task_name, return_code

    def _job_failed(self, return_code: int, output_paths: List[str]) -> bool:
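The added block posts a Lark notification whenever the launch ultimately returns -1. A minimal standalone sketch of that path using a stand-in reporter; the extra check that a reporter is actually configured is an assumption for the sketch, not something shown in this hunk:

```python
class FakeReporter:
    """Stand-in for opencompass.utils.LarkReporter with the same post() shape."""

    def post(self, title: str, content: str) -> None:
        print(f'[{title}] {content}')


lark_reporter = FakeReporter()
task_name, return_code = 'OpenICLInferTask[model-a/dataset-1]', -1  # fabricated

# Lark Report when failed (mirrors the added block above).
if return_code == -1 and lark_reporter is not None:
    content = f'DLC job failed. Task name: {task_name}'
    lark_reporter.post(title='DLC job failed', content=content)
```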
@@ -48,7 +48,7 @@ class VOLCRunner(BaseRunner):
                  retry: int = 2,
                  debug: bool = False,
                  lark_bot_url: str = None,
-                 keep_tmp_file: bool = False):
+                 keep_tmp_file: bool = True):
         super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
         self.volcano_cfg = volcano_cfg
         self.max_num_workers = max_num_workers