https://github.com/open-compass/opencompass.git

[Sync] Use finally to clean up temp files (#337)

commit ce65d3393b
parent 2cd994c3d1
opencompass/runners/dlc.py
@@ -86,11 +86,13 @@ class DLCRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_params.py'
-        task_cfg.dump(param_file)
-
-        # Build up DLC command
-        pwd = os.getcwd()
-        shell_cmd = (f'source {self.aliyun_cfg["bashrc_path"]}; '
-                     f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
-                     f'cd {pwd}; '
-                     '{task_cmd}')
+        try:
+            task_cfg.dump(param_file)
+
+            # Build up DLC command
+            pwd = os.getcwd()
+            shell_cmd = (
+                f'source {self.aliyun_cfg["bashrc_path"]}; '
+                f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
+                f'cd {pwd}; '
+                '{task_cmd}')
@@ -107,7 +109,9 @@ class DLCRunner(BaseRunner):
                     f' --worker_memory {max(num_gpus * 32, 48)}'
                     f" --worker_image {self.aliyun_cfg['worker_image']}"
                     ' --interactive')
-        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()

             logger = get_logger()
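The reflowed get_cmd lines above are only formatting, but the pattern they format is worth noting: functools.partial freezes the config path and template so the command can be rebuilt later with a plain get_cmd() call, e.g. when a job is resubmitted. A minimal runnable sketch, with a hypothetical Task class standing in for the real task object:

from functools import partial


class Task:
    # Hypothetical stand-in for the runner's task object
    def get_command(self, cfg_path: str, template: str) -> str:
        return template.format(task_cmd=f'python -m task {cfg_path}')


task = Task()
get_cmd = partial(task.get_command,
                  cfg_path='tmp/123_params.py',
                  template='{task_cmd}')
cmd = get_cmd()  # can be called again to rebuild the command on retry
print(cmd)       # -> python -m task tmp/123_params.py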
@@ -131,7 +135,8 @@ class DLCRunner(BaseRunner):

             retry = self.retry
             output_paths = task.get_output_paths()
-        while self._job_failed(result.returncode, output_paths) and retry > 0:
+            while self._job_failed(result.returncode,
+                                   output_paths) and retry > 0:
                 retry -= 1
                 if random_sleep:
                     time.sleep(random.randint(0, 10))
@@ -142,7 +147,7 @@ class DLCRunner(BaseRunner):
                                        text=True,
                                        stdout=stdout,
                                        stderr=stdout)
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode
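The substance of this commit is the same in all three runners: the dump-and-launch body moves into a try block, and os.remove(param_file) moves into finally, so the temp params file is deleted even when dumping or launching raises. A minimal sketch of the pattern, with a hypothetical launch() standing in for the runner body:

import os

import mmengine


def run_task(task_cfg, launch):
    mmengine.mkdir_or_exist('tmp/')
    param_file = f'tmp/{os.getpid()}_params.py'
    try:
        task_cfg.dump(param_file)
        return launch(param_file)  # hypothetical: submit and wait for the job
    finally:
        # Runs on success, on exception, and on early return alike
        os.remove(param_file)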
opencompass/runners/local.py
@@ -62,6 +62,7 @@ class LocalRunner(BaseRunner):
                 # get cmd
                 mmengine.mkdir_or_exist('tmp/')
                 param_file = f'tmp/{os.getpid()}_params.py'
+                try:
                     task.cfg.dump(param_file)
                     cmd = task.get_command(cfg_path=param_file,
                                            template='{task_cmd}')
@@ -70,6 +71,7 @@ class LocalRunner(BaseRunner):
                         task.run()
                     else:
                         subprocess.run(cmd, shell=True, text=True)
+                finally:
                     os.remove(param_file)
                 status.append((task_name, 0))
             else:
@@ -141,12 +143,15 @@ class LocalRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_{index}_params.py'
+        try:
             task.cfg.dump(param_file)

             # Build up slurm command
             tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
             tmpl += ' {task_cmd}'
-        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()

             logger = get_logger()
@@ -165,7 +170,7 @@ class LocalRunner(BaseRunner):

             if result.returncode != 0:
                 logger.warning(f'task {task_name} fail, see\n{out_path}')
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode
opencompass/runners/slurm.py
@@ -91,6 +91,7 @@ class SlurmRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_params.py'
+        try:
             task_cfg.dump(param_file)

             # Build up slurm command
@@ -104,7 +105,9 @@ class SlurmRunner(BaseRunner):
             if num_gpus > 0:
                 tmpl += f' --gres=gpu:{num_gpus}'
             tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
-        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()

             logger = get_logger()
@@ -128,7 +131,8 @@ class SlurmRunner(BaseRunner):

             retry = self.retry
             output_paths = task.get_output_paths()
-        while self._job_failed(result.returncode, output_paths) and retry > 0:
+            while self._job_failed(result.returncode,
+                                   output_paths) and retry > 0:
                 retry -= 1
                 if random_sleep:
                     time.sleep(random.randint(0, 10))
@@ -142,7 +146,7 @@ class SlurmRunner(BaseRunner):

             if result.returncode != 0 and not self.debug:
                 logger.warning(f'task {task_name} fail, see\n{out_path}')
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode
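Both DLCRunner and SlurmRunner wrap resubmission in the retry loop reflowed above: a bounded retry with a randomized sleep to stagger load on the scheduler. Abstracted out, the pattern looks roughly like the sketch below, assuming stand-in run_once and job_failed callables (hypothetical names mirroring the runner's launch body and _job_failed):

import random
import time


def launch_with_retry(run_once, job_failed, max_retry: int,
                      random_sleep: bool = True):
    result = run_once()
    retry = max_retry
    while job_failed(result) and retry > 0:
        retry -= 1
        if random_sleep:
            # Stagger resubmission so many workers don't hit the
            # scheduler at the same instant
            time.sleep(random.randint(0, 10))
        result = run_once()
    return result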
opencompass/utils/run.py
@@ -3,7 +3,9 @@ from typing import List, Union

 import tabulate
 from mmengine.config import Config

+from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
 from opencompass.utils import get_logger, match_files

@@ -118,54 +120,61 @@ def exec_mm_infer_runner(tasks, args, cfg):
     runner(tasks)


-def exec_infer_runner(tasks, args, cfg):
-    """execute infer runner according to args."""
-    if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLInferTask'),
-                             max_num_workers=args.max_num_workers,
-                             partition=args.partition,
-                             quotatype=args.quotatype,
-                             qos=args.qos,
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLInferTask'),
-                           max_num_workers=args.max_num_workers,
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
-                           retry=args.retry,
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
-    else:
-        runner = LocalRunner(task=dict(type='OpenICLInferTask'),
-                             max_num_workers=args.max_num_workers,
-                             max_workers_per_gpu=args.max_workers_per_gpu,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    runner(tasks)
+def get_config_type(obj) -> str:
+    return f'{obj.__module__}.{obj.__name__}'


-def exec_eval_runner(tasks, args, cfg):
-    """execute infer runner according to args."""
+def fill_infer_cfg(cfg, args):
+    new_cfg = dict(infer=dict(
+        partitioner=dict(type=get_config_type(SizePartitioner),
+                         max_task_size=args.max_partition_size,
+                         gen_task_coef=args.gen_task_coef),
+        runner=dict(
+            max_num_workers=args.max_num_workers,
+            debug=args.debug,
+            task=dict(type=get_config_type(OpenICLInferTask)),
+            lark_bot_url=cfg['lark_bot_url'],
+        )), )
     if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLEvalTask'),
-                             max_num_workers=args.max_num_workers,
-                             partition=args.partition,
-                             quotatype=args.quotatype,
-                             qos=args.qos,
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
+        new_cfg['infer']['runner']['type'] = get_config_type(SlurmRunner)
+        new_cfg['infer']['runner']['partition'] = args.partition
+        new_cfg['infer']['runner']['quotatype'] = args.quotatype
+        new_cfg['infer']['runner']['qos'] = args.qos
+        new_cfg['infer']['runner']['retry'] = args.retry
     elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLEvalTask'),
-                           max_num_workers=args.max_num_workers,
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
-                           retry=args.retry,
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
+        new_cfg['infer']['runner']['type'] = get_config_type(DLCRunner)
+        new_cfg['infer']['runner']['aliyun_cfg'] = Config.fromfile(
+            args.aliyun_cfg)
+        new_cfg['infer']['runner']['retry'] = args.retry
     else:
-        runner = LocalRunner(task=dict(type='OpenICLEvalTask'),
-                             max_num_workers=args.max_num_workers,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    runner(tasks)
+        new_cfg['infer']['runner']['type'] = get_config_type(LocalRunner)
+        new_cfg['infer']['runner'][
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
+    cfg.merge_from_dict(new_cfg)
+
+
+def fill_eval_cfg(cfg, args):
+    new_cfg = dict(
+        eval=dict(partitioner=dict(type=get_config_type(NaivePartitioner)),
+                  runner=dict(
+                      max_num_workers=args.max_num_workers,
+                      debug=args.debug,
+                      task=dict(type=get_config_type(OpenICLEvalTask)),
+                      lark_bot_url=cfg['lark_bot_url'],
+                  )))
+    if args.slurm:
+        new_cfg['eval']['runner']['type'] = get_config_type(SlurmRunner)
+        new_cfg['eval']['runner']['partition'] = args.partition
+        new_cfg['eval']['runner']['quotatype'] = args.quotatype
+        new_cfg['eval']['runner']['qos'] = args.qos
+        new_cfg['eval']['runner']['retry'] = args.retry
+    elif args.dlc:
+        new_cfg['eval']['runner']['type'] = get_config_type(DLCRunner)
+        new_cfg['eval']['runner']['aliyun_cfg'] = Config.fromfile(
+            args.aliyun_cfg)
+        new_cfg['eval']['runner']['retry'] = args.retry
+    else:
+        new_cfg['eval']['runner']['type'] = get_config_type(LocalRunner)
+        new_cfg['eval']['runner'][
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
+    cfg.merge_from_dict(new_cfg)
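The new helpers replace direct runner construction with a plain dict: get_config_type stores each class as its dotted import path, and Config.merge_from_dict then folds the synthesized dict into the loaded config. A small sketch of that round trip, using a hypothetical DummyRunner in place of a real runner class:

from mmengine.config import Config


def get_config_type(obj) -> str:
    return f'{obj.__module__}.{obj.__name__}'


class DummyRunner:  # hypothetical stand-in for LocalRunner etc.
    pass


cfg = Config(dict(work_dir='outputs/'))
new_cfg = dict(infer=dict(runner=dict(type=get_config_type(DummyRunner),
                                      max_num_workers=32)))
cfg.merge_from_dict(new_cfg)  # nested keys merge into the existing config
print(cfg.infer.runner.type)  # e.g. '__main__.DummyRunner'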
run.py (46 lines changed)
@@ -6,13 +6,12 @@ from datetime import datetime

 from mmengine.config import Config, DictAction

-from opencompass.partitioners import (MultimodalNaivePartitioner,
-                                      NaivePartitioner, SizePartitioner)
+from opencompass.partitioners import MultimodalNaivePartitioner
 from opencompass.registry import PARTITIONERS, RUNNERS
 from opencompass.runners import SlurmRunner
 from opencompass.utils import LarkReporter, Summarizer, get_logger
-from opencompass.utils.run import (exec_eval_runner, exec_infer_runner,
-                                   exec_mm_infer_runner, get_config_from_arg)
+from opencompass.utils.run import (exec_mm_infer_runner, fill_eval_cfg,
+                                   fill_infer_cfg, get_config_from_arg)


 def parse_args():
@@ -245,20 +244,10 @@ def main():
             tasks = partitioner(cfg)
             exec_mm_infer_runner(tasks, args, cfg)
             return
-        elif args.dlc or args.slurm or cfg.get('infer', None) is None:
-            # Use SizePartitioner to split into subtasks
-            partitioner = SizePartitioner(
-                osp.join(cfg['work_dir'], 'predictions/'),
-                max_task_size=args.max_partition_size,
-                gen_task_coef=args.gen_task_coef)
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            # execute the infer subtasks
-            exec_infer_runner(tasks, args, cfg)
-        # If they have specified "infer" in config and haven't used --slurm
-        # or --dlc, just follow the config
-        else:
+
+        if args.dlc or args.slurm or cfg.get('infer', None) is None:
+            fill_infer_cfg(cfg, args)
+
         if args.partition is not None:
             if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                 cfg.infer.runner.partition = args.partition
@@ -270,8 +259,8 @@ def main():
             cfg.infer.runner.debug = True
         if args.lark:
             cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
-        cfg.infer.partitioner['out_dir'] = osp.join(
-            cfg['work_dir'], 'predictions/')
+        cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
+                                                    'predictions/')
         partitioner = PARTITIONERS.build(cfg.infer.partitioner)
         tasks = partitioner(cfg)
         if args.dry_run:
@@ -289,18 +278,10 @@ def main():
                            'also specified --slurm or --dlc. '
                            'The "eval" configuration will be overridden by '
                            'your runtime arguments.')

         if args.dlc or args.slurm or cfg.get('eval', None) is None:
-            # Use NaivePartitioner,not split
-            partitioner = NaivePartitioner(
-                osp.join(cfg['work_dir'], 'results/'))
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            # execute the eval tasks
-            exec_eval_runner(tasks, args, cfg)
-        # If they have specified "eval" in config and haven't used --slurm
-        # or --dlc, just follow the config
-        else:
+            fill_eval_cfg(cfg, args)
+
         if args.partition is not None:
             if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                 cfg.eval.runner.partition = args.partition
@@ -312,8 +293,7 @@ def main():
             cfg.eval.runner.debug = True
         if args.lark:
             cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
-        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'],
-                                                   'results/')
+        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
         partitioner = PARTITIONERS.build(cfg.eval.partitioner)
         tasks = partitioner(cfg)
         if args.dry_run: