This commit is contained in:
BigDong 2025-05-17 13:43:54 +08:00 committed by GitHub
commit 32c1e38207
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 89 additions and 33 deletions

View File

@ -141,7 +141,8 @@ class NumWorkerPartitioner(BasePartitioner):
dataset = build_dataset_from_cfg(dataset)
self.dataset_size[dataset_abbr] = len(dataset.test)
mmengine.mkdir_or_exist('.cache/')
tmp_dir_root = osp.dirname(self.dataset_size_path)
mmengine.mkdir_or_exist(tmp_dir_root)
mmengine.dump(self.dataset_size,
self.dataset_size_path,
indent=4,

View File

@ -214,7 +214,8 @@ class SizePartitioner(BasePartitioner):
dataset = build_dataset_from_cfg(dataset)
self.dataset_size[dataset_abbr] = len(dataset.test)
mmengine.mkdir_or_exist('.cache/')
tmp_dir_root = osp.dirname(self.dataset_size_path)
mmengine.mkdir_or_exist(tmp_dir_root)
mmengine.dump(self.dataset_size,
self.dataset_size_path,
indent=4,

View File

@ -198,7 +198,8 @@ class SubjectiveNumWorkerPartitioner(SubjectiveNaivePartitioner):
dataset = build_dataset_from_cfg(dataset)
self.dataset_size[dataset_abbr] = len(dataset.test)
mmengine.mkdir_or_exist('.cache/')
tmp_dir_root = osp.dirname(self.dataset_size_path)
mmengine.mkdir_or_exist(tmp_dir_root)
mmengine.dump(self.dataset_size,
self.dataset_size_path,
indent=4,

View File

@ -274,7 +274,8 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner):
dataset = build_dataset_from_cfg(dataset)
self.dataset_size[dataset_abbr] = len(dataset.test)
mmengine.mkdir_or_exist('.cache/')
tmp_dir_root = osp.dirname(self.dataset_size_path)
mmengine.mkdir_or_exist(tmp_dir_root)
mmengine.dump(self.dataset_size,
self.dataset_size_path,
indent=4,

View File

@ -15,18 +15,21 @@ class BaseRunner:
task (ConfigDict): Task type config.
debug (bool): Whether to run in debug mode.
lark_bot_url (str): Lark bot url.
tmp_dir (str): The directory to store temporary files.
Defaults to 'tmp'.
"""
def __init__(self,
task: ConfigDict,
debug: bool = False,
lark_bot_url: str = None):
lark_bot_url: str = None,
tmp_dir: str = 'tmp'):
self.task_cfg = Config(task)
self.debug = debug
if lark_bot_url:
self.lark_reporter = LarkReporter(lark_bot_url)
else:
self.lark_reporter = None
self.tmp_dir = tmp_dir
def __call__(self, tasks: List[Dict[str, Any]]):
"""Launch multiple tasks and summarize the results.

View File

@ -33,6 +33,9 @@ class DLCRunner(BaseRunner):
retry (int): Number of retries when job failed. Default: 2.
debug (bool): Whether to run in debug mode. Default: False.
lark_bot_url (str): Lark bot url. Default: None.
keep_tmp_file (bool): Whether to keep the temporary file.
Default: True.
tmp_dir (str): The directory to store temporary files. Default: 'tmp'.
"""
def __init__(
@ -45,8 +48,12 @@ class DLCRunner(BaseRunner):
debug: bool = False,
lark_bot_url: str = None,
keep_tmp_file: bool = True,
tmp_dir: str = 'tmp',
):
super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
super().__init__(task=task,
debug=debug,
lark_bot_url=lark_bot_url,
tmp_dir=tmp_dir)
self.aliyun_cfg = aliyun_cfg
self.max_num_workers = max_num_workers
self.retry = retry
@ -114,12 +121,13 @@ class DLCRunner(BaseRunner):
break
# Dump task config to file
mmengine.mkdir_or_exist('tmp/')
mmengine.mkdir_or_exist(self.tmp_dir)
# Using uuid to avoid filename conflict
import uuid
uuid_str = str(uuid.uuid4())
param_file = f'tmp/{uuid_str}_params.py'
param_file = f'{uuid_str}_params.py'
param_file = osp.join(self.tmp_dir, param_file)
pwd = os.getcwd()
try:
cfg.dump(param_file)

View File

@ -48,6 +48,10 @@ class LocalRunner(BaseRunner):
Defaults to 1.
debug (bool): Whether to run in debug mode.
lark_bot_url (str): Lark bot url.
keep_tmp_file (bool): Whether to keep the temporary file. Defaults to
False.
tmp_dir (str): The directory to store temporary files.
Defaults to 'tmp'.
"""
def __init__(self,
@ -57,8 +61,12 @@ class LocalRunner(BaseRunner):
max_workers_per_gpu: int = 1,
lark_bot_url: str = None,
keep_tmp_file: bool = False,
tmp_dir: str = 'tmp',
**kwargs):
super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
super().__init__(task=task,
debug=debug,
lark_bot_url=lark_bot_url,
tmp_dir=tmp_dir)
self.max_num_workers = max_num_workers
self.max_workers_per_gpu = max_workers_per_gpu
self.keep_tmp_file = keep_tmp_file
@ -101,11 +109,12 @@ class LocalRunner(BaseRunner):
num_gpus = task.num_gpus
assert len(all_gpu_ids) >= num_gpus
# get cmd
mmengine.mkdir_or_exist('tmp/')
mmengine.mkdir_or_exist(self.tmp_dir)
import uuid
uuid_str = str(uuid.uuid4())
param_file = f'tmp/{uuid_str}_params.py'
param_file = f'{uuid_str}_params.py'
param_file = osp.join(self.tmp_dir, param_file)
try:
task.cfg.dump(param_file)
# if use torchrun, restrict it behaves the same as non
@ -135,7 +144,8 @@ class LocalRunner(BaseRunner):
else:
task.run()
else:
tmp_logs = f'tmp/{os.getpid()}_debug.log'
tmp_logs = f'{os.getpid()}_debug.log'
tmp_logs = osp.join(self.tmp_dir, tmp_logs)
get_logger().warning(
f'Debug mode, log will be saved to {tmp_logs}')
with open(tmp_logs, 'a') as log_file:
@ -207,14 +217,13 @@ class LocalRunner(BaseRunner):
task_name = task.name
pwd = os.getcwd()
# Dump task config to file
mmengine.mkdir_or_exist('tmp/')
mmengine.mkdir_or_exist(self.tmp_dir)
# Using uuid to avoid filename conflict
import uuid
uuid_str = str(uuid.uuid4())
param_file = f'{pwd}/tmp/{uuid_str}_params.py'
param_file = f'{uuid_str}_params.py'
param_file = osp.join(self.tmp_dir, param_file)
try:
task.cfg.dump(param_file)
tmpl = get_command_template(gpu_ids)

View File

@ -161,6 +161,8 @@ class LocalAPIRunner(BaseRunner):
Defaults to 16.
debug (bool): Whether to run in debug mode.
lark_bot_url (str): Lark bot url.
tmp_dir (str): The directory to store temporary files.
Defaults to 'tmp'.
"""
def __init__(self,
@ -168,8 +170,12 @@ class LocalAPIRunner(BaseRunner):
concurrent_users: int,
max_num_workers: int = 16,
debug: bool = False,
lark_bot_url: str = None):
super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
lark_bot_url: str = None,
tmp_dir: str = 'tmp'):
super().__init__(task=task,
debug=debug,
lark_bot_url=lark_bot_url,
tmp_dir=tmp_dir)
self.max_num_workers = max_num_workers
self.concurrent_users = concurrent_users
assert task['type'] in [
@ -194,8 +200,9 @@ class LocalAPIRunner(BaseRunner):
task = TASKS.build(dict(cfg=task, type=self.task_cfg['type']))
task_name = task.name
# get cmd
mmengine.mkdir_or_exist('tmp/')
param_file = f'tmp/{os.getpid()}_params.py'
mmengine.mkdir_or_exist(self.tmp_dir)
param_file = f'{os.getpid()}_params.py'
param_file = osp.join(self.tmp_dir, param_file)
try:
task.cfg.dump(param_file)
cmd = task.get_command(cfg_path=param_file,

View File

@ -33,6 +33,8 @@ class SlurmRunner(BaseRunner):
lark_bot_url (str): Lark bot url. Defaults to None.
extra_command (List, optional): Extra slurm command.
For example ['-c 12', '-w node1']. Defaults to None.
tmp_dir (str): The directory to store temporary files.
Defaults to 'tmp'.
"""
def __init__(self,
@ -44,8 +46,12 @@ class SlurmRunner(BaseRunner):
qos: str = None,
debug: bool = False,
lark_bot_url: str = None,
extra_command: Optional[List[str]] = None):
super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
extra_command: Optional[List[str]] = None,
tmp_dir: str = 'tmp'):
super().__init__(task=task,
debug=debug,
lark_bot_url=lark_bot_url,
tmp_dir=tmp_dir)
self.max_num_workers = max_num_workers
self.retry = retry
self.partition = partition
@ -93,8 +99,9 @@ class SlurmRunner(BaseRunner):
task_name = task.name
# Dump task config to file
mmengine.mkdir_or_exist('tmp/')
param_file = f'tmp/{os.getpid()}_params.py'
mmengine.mkdir_or_exist(self.tmp_dir)
param_file = f'{os.getpid()}_params.py'
param_file = osp.join(self.tmp_dir, param_file)
try:
cfg.dump(param_file)

View File

@ -47,6 +47,10 @@ class SlurmSequentialRunner(BaseRunner):
lark_bot_url (str): Lark bot url. Defaults to None.
extra_command (List, optional): Extra slurm command.
For example ['-c 12', '-w node1']. Defaults to None.
keep_tmp_file (bool): Whether to keep the temporary file. Defaults to
False.
tmp_dir (str): The directory to store temporary files.
Defaults to 'tmp'.
"""
def __init__(self,
@ -60,8 +64,12 @@ class SlurmSequentialRunner(BaseRunner):
debug: bool = False,
lark_bot_url: str = None,
extra_command: Optional[List[str]] = None,
keep_tmp_file: bool = False):
super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
keep_tmp_file: bool = False,
tmp_dir: str = 'tmp'):
super().__init__(task=task,
debug=debug,
lark_bot_url=lark_bot_url,
tmp_dir=tmp_dir)
self.max_num_workers = max_num_workers
self.retry = retry
self.partition = partition
@ -172,11 +180,12 @@ class SlurmSequentialRunner(BaseRunner):
task_name = self.task_prefix + task_name
# Dump task config to file
mmengine.mkdir_or_exist('tmp/')
mmengine.mkdir_or_exist(self.tmp_dir)
# Using uuid to avoid filename conflict
import uuid
uuid_str = str(uuid.uuid4())
param_file = f'tmp/{uuid_str}_params.py'
param_file = f'{uuid_str}_params.py'
param_file = osp.join(self.tmp_dir, param_file)
process = None
try:
cfg.dump(param_file)

View File

@ -36,6 +36,9 @@ class VOLCRunner(BaseRunner):
retry (int): Number of retries when job failed. Default: 2.
debug (bool): Whether to run in debug mode. Default: False.
lark_bot_url (str): Lark bot url. Default: None.
keep_tmp_file (bool): Whether to keep the temporary file. Default:
True.
tmp_dir (str): The directory to store temporary files. Default: 'tmp'.
"""
def __init__(self,
@ -48,8 +51,12 @@ class VOLCRunner(BaseRunner):
retry: int = 2,
debug: bool = False,
lark_bot_url: str = None,
keep_tmp_file: bool = True):
super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
keep_tmp_file: bool = True,
tmp_dir: str = 'tmp'):
super().__init__(task=task,
debug=debug,
lark_bot_url=lark_bot_url,
tmp_dir=tmp_dir)
self.volcano_cfg = volcano_cfg
self.max_num_workers = max_num_workers
self.retry = retry
@ -101,13 +108,15 @@ class VOLCRunner(BaseRunner):
# Build up VCC command
pwd = os.getcwd()
# Dump task config to file
mmengine.mkdir_or_exist('tmp/')
mmengine.mkdir_or_exist(self.tmp_dir)
# Using uuid to avoid filename conflict
import uuid
uuid_str = str(uuid.uuid4())
param_file = f'{pwd}/tmp/{uuid_str}_params.py'
param_file = f'{uuid_str}_params.py'
param_file = osp.join(self.tmp_dir, param_file)
volc_cfg_file = f'{pwd}/tmp/{uuid_str}_cfg.yaml'
volc_cfg_file = f'{uuid_str}_cfg.yaml'
volc_cfg_file = osp.join(self.tmp_dir, volc_cfg_file)
volc_cfg = self._choose_flavor(num_gpus)
with open(volc_cfg_file, 'w') as fp:
yaml.dump(volc_cfg, fp, sort_keys=False)