[Sync] Use finally to clean up temp files (#337)

Tong Gao, 2023-09-04 15:20:16 +08:00, committed by GitHub
parent 2cd994c3d1
commit ce65d3393b
5 changed files with 233 additions and 230 deletions
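
The change is mechanical across the DLC, Local, and Slurm runners: each launch method dumps the task config to tmp/, and everything from that dump through the subprocess call now sits in a try block whose finally removes the temp file even when the task raises or returns early. A condensed sketch of the resulting shape (hypothetical launch signature and task.py command; the real methods also build runner-specific commands and retry on failure):

import os
import subprocess

import mmengine
from mmengine.config import Config


def launch(task_cfg: Config, cmd_template: str) -> int:
    mmengine.mkdir_or_exist('tmp/')
    param_file = f'tmp/{os.getpid()}_params.py'
    try:
        task_cfg.dump(param_file)
        cmd = cmd_template.format(task_cmd=f'python task.py {param_file}')
        result = subprocess.run(cmd, shell=True, text=True)
    finally:
        # Clean up: reached on success, on exception, and on early return
        os.remove(param_file)
    return result.returncode


print(launch(Config(dict(lr=0.1)), 'echo {task_cmd}'))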

@@ -86,11 +86,13 @@ class DLCRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_params.py'
+        try:
             task_cfg.dump(param_file)
 
             # Build up DLC command
             pwd = os.getcwd()
-            shell_cmd = (f'source {self.aliyun_cfg["bashrc_path"]}; '
+            shell_cmd = (
+                f'source {self.aliyun_cfg["bashrc_path"]}; '
                 f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
                 f'cd {pwd}; '
                 '{task_cmd}')
@@ -107,7 +109,9 @@ class DLCRunner(BaseRunner):
                     f' --worker_memory {max(num_gpus * 32, 48)}'
                     f" --worker_image {self.aliyun_cfg['worker_image']}"
                     ' --interactive')
-            get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()
 
             logger = get_logger()
@@ -131,7 +135,8 @@ class DLCRunner(BaseRunner):
             retry = self.retry
             output_paths = task.get_output_paths()
 
-            while self._job_failed(result.returncode, output_paths) and retry > 0:
+            while self._job_failed(result.returncode,
+                                   output_paths) and retry > 0:
                 retry -= 1
                 if random_sleep:
                     time.sleep(random.randint(0, 10))
@@ -142,7 +147,7 @@ class DLCRunner(BaseRunner):
                     text=True,
                     stdout=stdout,
                     stderr=stdout)
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode

@@ -62,6 +62,7 @@ class LocalRunner(BaseRunner):
                 # get cmd
                 mmengine.mkdir_or_exist('tmp/')
                 param_file = f'tmp/{os.getpid()}_params.py'
+                try:
                     task.cfg.dump(param_file)
                     cmd = task.get_command(cfg_path=param_file,
                                            template='{task_cmd}')
@@ -70,6 +71,7 @@ class LocalRunner(BaseRunner):
                         task.run()
                     else:
                         subprocess.run(cmd, shell=True, text=True)
+                finally:
                     os.remove(param_file)
                 status.append((task_name, 0))
         else:
@@ -141,12 +143,15 @@ class LocalRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_{index}_params.py'
+        try:
             task.cfg.dump(param_file)
 
             # Build up slurm command
             tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
             tmpl += ' {task_cmd}'
-            get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()
 
             logger = get_logger()
@@ -165,7 +170,7 @@ class LocalRunner(BaseRunner):
 
             if result.returncode != 0:
                 logger.warning(f'task {task_name} fail, see\n{out_path}')
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode

@@ -91,6 +91,7 @@ class SlurmRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_params.py'
+        try:
             task_cfg.dump(param_file)
 
             # Build up slurm command
@@ -104,7 +105,9 @@ class SlurmRunner(BaseRunner):
             if num_gpus > 0:
                 tmpl += f' --gres=gpu:{num_gpus}'
             tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
-            get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()
 
             logger = get_logger()
@@ -128,7 +131,8 @@ class SlurmRunner(BaseRunner):
             retry = self.retry
             output_paths = task.get_output_paths()
 
-            while self._job_failed(result.returncode, output_paths) and retry > 0:
+            while self._job_failed(result.returncode,
+                                   output_paths) and retry > 0:
                 retry -= 1
                 if random_sleep:
                     time.sleep(random.randint(0, 10))
@@ -142,7 +146,7 @@ class SlurmRunner(BaseRunner):
 
             if result.returncode != 0 and not self.debug:
                 logger.warning(f'task {task_name} fail, see\n{out_path}')
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode

@@ -3,7 +3,9 @@ from typing import List, Union
 
 import tabulate
 from mmengine.config import Config
 
+from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
 from opencompass.utils import get_logger, match_files
@@ -118,54 +120,61 @@ def exec_mm_infer_runner(tasks, args, cfg):
     runner(tasks)
 
 
-def exec_infer_runner(tasks, args, cfg):
-    """execute infer runner according to args."""
-    if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLInferTask'),
-                             max_num_workers=args.max_num_workers,
-                             partition=args.partition,
-                             quotatype=args.quotatype,
-                             qos=args.qos,
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLInferTask'),
-                           max_num_workers=args.max_num_workers,
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
-                           retry=args.retry,
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
-    else:
-        runner = LocalRunner(task=dict(type='OpenICLInferTask'),
-                             max_num_workers=args.max_num_workers,
-                             max_workers_per_gpu=args.max_workers_per_gpu,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    runner(tasks)
-
-
-def exec_eval_runner(tasks, args, cfg):
-    """execute infer runner according to args."""
-    if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLEvalTask'),
-                             max_num_workers=args.max_num_workers,
-                             partition=args.partition,
-                             quotatype=args.quotatype,
-                             qos=args.qos,
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLEvalTask'),
-                           max_num_workers=args.max_num_workers,
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
-                           retry=args.retry,
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
-    else:
-        runner = LocalRunner(task=dict(type='OpenICLEvalTask'),
-                             max_num_workers=args.max_num_workers,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    runner(tasks)
+def get_config_type(obj) -> str:
+    return f'{obj.__module__}.{obj.__name__}'
+
+
+def fill_infer_cfg(cfg, args):
+    new_cfg = dict(infer=dict(
+        partitioner=dict(type=get_config_type(SizePartitioner),
+                         max_task_size=args.max_partition_size,
+                         gen_task_coef=args.gen_task_coef),
+        runner=dict(
+            max_num_workers=args.max_num_workers,
+            debug=args.debug,
+            task=dict(type=get_config_type(OpenICLInferTask)),
+            lark_bot_url=cfg['lark_bot_url'],
+        )), )
+    if args.slurm:
+        new_cfg['infer']['runner']['type'] = get_config_type(SlurmRunner)
+        new_cfg['infer']['runner']['partition'] = args.partition
+        new_cfg['infer']['runner']['quotatype'] = args.quotatype
+        new_cfg['infer']['runner']['qos'] = args.qos
+        new_cfg['infer']['runner']['retry'] = args.retry
+    elif args.dlc:
+        new_cfg['infer']['runner']['type'] = get_config_type(DLCRunner)
+        new_cfg['infer']['runner']['aliyun_cfg'] = Config.fromfile(
+            args.aliyun_cfg)
+        new_cfg['infer']['runner']['retry'] = args.retry
+    else:
+        new_cfg['infer']['runner']['type'] = get_config_type(LocalRunner)
+        new_cfg['infer']['runner'][
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
+    cfg.merge_from_dict(new_cfg)
+
+
+def fill_eval_cfg(cfg, args):
+    new_cfg = dict(
+        eval=dict(partitioner=dict(type=get_config_type(NaivePartitioner)),
+                  runner=dict(
+                      max_num_workers=args.max_num_workers,
+                      debug=args.debug,
+                      task=dict(type=get_config_type(OpenICLEvalTask)),
+                      lark_bot_url=cfg['lark_bot_url'],
+                  )))
+    if args.slurm:
+        new_cfg['eval']['runner']['type'] = get_config_type(SlurmRunner)
+        new_cfg['eval']['runner']['partition'] = args.partition
+        new_cfg['eval']['runner']['quotatype'] = args.quotatype
+        new_cfg['eval']['runner']['qos'] = args.qos
+        new_cfg['eval']['runner']['retry'] = args.retry
+    elif args.dlc:
+        new_cfg['eval']['runner']['type'] = get_config_type(DLCRunner)
+        new_cfg['eval']['runner']['aliyun_cfg'] = Config.fromfile(
+            args.aliyun_cfg)
+        new_cfg['eval']['runner']['retry'] = args.retry
+    else:
+        new_cfg['eval']['runner']['type'] = get_config_type(LocalRunner)
+        new_cfg['eval']['runner'][
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
+    cfg.merge_from_dict(new_cfg)
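
get_config_type turns a class into the dotted import path that mmengine's registry resolves later, and merge_from_dict layers the CLI-derived dict onto the loaded config. A toy illustration of both (the nested keys mirror the structure built above; the values are made up):

from collections import OrderedDict

from mmengine.config import Config


def get_config_type(obj) -> str:
    return f'{obj.__module__}.{obj.__name__}'


print(get_config_type(OrderedDict))  # collections.OrderedDict

cfg = Config(dict(infer=dict(runner=dict(debug=False))))
cfg.merge_from_dict(dict(infer=dict(runner=dict(debug=True, retry=2))))
print(cfg.infer.runner.debug, cfg.infer.runner.retry)  # True 2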

run.py

@@ -6,13 +6,12 @@ from datetime import datetime
 
 from mmengine.config import Config, DictAction
 
-from opencompass.partitioners import (MultimodalNaivePartitioner,
-                                      NaivePartitioner, SizePartitioner)
+from opencompass.partitioners import MultimodalNaivePartitioner
 from opencompass.registry import PARTITIONERS, RUNNERS
 from opencompass.runners import SlurmRunner
 from opencompass.utils import LarkReporter, Summarizer, get_logger
-from opencompass.utils.run import (exec_eval_runner, exec_infer_runner,
-                                   exec_mm_infer_runner, get_config_from_arg)
+from opencompass.utils.run import (exec_mm_infer_runner, fill_eval_cfg,
+                                   fill_infer_cfg, get_config_from_arg)
 
 
 def parse_args():
@@ -245,20 +244,10 @@ def main():
             tasks = partitioner(cfg)
             exec_mm_infer_runner(tasks, args, cfg)
             return
-        elif args.dlc or args.slurm or cfg.get('infer', None) is None:
-            # Use SizePartitioner to split into subtasks
-            partitioner = SizePartitioner(
-                osp.join(cfg['work_dir'], 'predictions/'),
-                max_task_size=args.max_partition_size,
-                gen_task_coef=args.gen_task_coef)
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            # execute the infer subtasks
-            exec_infer_runner(tasks, args, cfg)
-        # If they have specified "infer" in config and haven't used --slurm
-        # or --dlc, just follow the config
-        else:
+
+        if args.dlc or args.slurm or cfg.get('infer', None) is None:
+            fill_infer_cfg(cfg, args)
+
         if args.partition is not None:
             if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                 cfg.infer.runner.partition = args.partition
@@ -270,8 +259,8 @@ def main():
             cfg.infer.runner.debug = True
         if args.lark:
             cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
-        cfg.infer.partitioner['out_dir'] = osp.join(
-            cfg['work_dir'], 'predictions/')
+        cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
+                                                    'predictions/')
         partitioner = PARTITIONERS.build(cfg.infer.partitioner)
         tasks = partitioner(cfg)
         if args.dry_run:
@@ -289,18 +278,10 @@ def main():
                 'also specified --slurm or --dlc. '
                 'The "eval" configuration will be overridden by '
                 'your runtime arguments.')
-        if args.dlc or args.slurm or cfg.get('eval', None) is None:
-            # Use NaivePartitioner, not split
-            partitioner = NaivePartitioner(
-                osp.join(cfg['work_dir'], 'results/'))
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            # execute the eval tasks
-            exec_eval_runner(tasks, args, cfg)
-        # If they have specified "eval" in config and haven't used --slurm
-        # or --dlc, just follow the config
-        else:
+
+        if args.dlc or args.slurm or cfg.get('eval', None) is None:
+            fill_eval_cfg(cfg, args)
+
         if args.partition is not None:
             if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                 cfg.eval.runner.partition = args.partition
@@ -312,8 +293,7 @@ def main():
             cfg.eval.runner.debug = True
         if args.lark:
             cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
-        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'],
-                                                   'results/')
+        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
         partitioner = PARTITIONERS.build(cfg.eval.partitioner)
         tasks = partitioner(cfg)
         if args.dry_run: