[Sync] Use finally to clean up temp files (#337)

Repository: https://github.com/open-compass/opencompass.git (mirror)
Parent commit: 2cd994c3d1
This commit: ce65d3393b

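Every runner hunk below (DLCRunner, LocalRunner, SlurmRunner) applies the same fix: the temporary parameter file is still created up front, but everything that can fail is now wrapped in a try block, and os.remove moves into finally, so tmp/<pid>_params.py is deleted even when the launched command fails or an exception is raised. The remaining hunks (opencompass.utils.run and run.py) belong to the same sync and replace direct runner construction with config filling. A minimal standalone sketch of the cleanup pattern, not the runners' actual code (run_with_temp_params and run_fn are invented names):

    import os

    import mmengine


    def run_with_temp_params(task_cfg, run_fn):
        """Dump a config to a temp file, run a callable on it, always clean up."""
        mmengine.mkdir_or_exist('tmp/')
        param_file = f'tmp/{os.getpid()}_params.py'
        try:
            task_cfg.dump(param_file)   # may raise
            return run_fn(param_file)   # may fail or raise
        finally:
            # Reached on success, failure and exceptions alike, so the temp
            # file can no longer leak. The commit calls os.remove directly;
            # the exists() guard is only for this sketch.
            if os.path.exists(param_file):
                os.remove(param_file)
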
@@ -86,65 +86,70 @@ class DLCRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_params.py'
-        task_cfg.dump(param_file)
-
-        # Build up DLC command
-        pwd = os.getcwd()
-        shell_cmd = (f'source {self.aliyun_cfg["bashrc_path"]}; '
-                     f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
-                     f'cd {pwd}; '
-                     '{task_cmd}')
-
-        tmpl = ('dlc create job'
-                f" --command '{shell_cmd}'"
-                f' --name {task_name[:512]}'
-                ' --kind BatchJob'
-                f" -c {self.aliyun_cfg['dlc_config_path']}"
-                f" --workspace_id {self.aliyun_cfg['workspace_id']}"
-                ' --worker_count 1'
-                f' --worker_cpu {max(num_gpus * 6, 8)}'
-                f' --worker_gpu {num_gpus}'
-                f' --worker_memory {max(num_gpus * 32, 48)}'
-                f" --worker_image {self.aliyun_cfg['worker_image']}"
-                ' --interactive')
-        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
-        cmd = get_cmd()
-
-        logger = get_logger()
-        logger.debug(f'Running command: {cmd}')
-
-        # Run command with retry
-        if self.debug:
-            stdout = None
-        else:
-            out_path = task.get_log_path(file_extension='out')
-            mmengine.mkdir_or_exist(osp.split(out_path)[0])
-            stdout = open(out_path, 'w', encoding='utf-8')
-
-        if random_sleep:
-            time.sleep(random.randint(0, 10))
-        result = subprocess.run(cmd,
-                                shell=True,
-                                text=True,
-                                stdout=stdout,
-                                stderr=stdout)
-
-        retry = self.retry
-        output_paths = task.get_output_paths()
-        while self._job_failed(result.returncode, output_paths) and retry > 0:
-            retry -= 1
-            if random_sleep:
-                time.sleep(random.randint(0, 10))
-            # Re-generate command to refresh ports.
-            cmd = get_cmd()
-            result = subprocess.run(cmd,
-                                    shell=True,
-                                    text=True,
-                                    stdout=stdout,
-                                    stderr=stdout)
-
-        # Clean up
-        os.remove(param_file)
+        try:
+            task_cfg.dump(param_file)
+
+            # Build up DLC command
+            pwd = os.getcwd()
+            shell_cmd = (
+                f'source {self.aliyun_cfg["bashrc_path"]}; '
+                f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
+                f'cd {pwd}; '
+                '{task_cmd}')
+
+            tmpl = ('dlc create job'
+                    f" --command '{shell_cmd}'"
+                    f' --name {task_name[:512]}'
+                    ' --kind BatchJob'
+                    f" -c {self.aliyun_cfg['dlc_config_path']}"
+                    f" --workspace_id {self.aliyun_cfg['workspace_id']}"
+                    ' --worker_count 1'
+                    f' --worker_cpu {max(num_gpus * 6, 8)}'
+                    f' --worker_gpu {num_gpus}'
+                    f' --worker_memory {max(num_gpus * 32, 48)}'
+                    f" --worker_image {self.aliyun_cfg['worker_image']}"
+                    ' --interactive')
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
+            cmd = get_cmd()
+
+            logger = get_logger()
+            logger.debug(f'Running command: {cmd}')
+
+            # Run command with retry
+            if self.debug:
+                stdout = None
+            else:
+                out_path = task.get_log_path(file_extension='out')
+                mmengine.mkdir_or_exist(osp.split(out_path)[0])
+                stdout = open(out_path, 'w', encoding='utf-8')
+
+            if random_sleep:
+                time.sleep(random.randint(0, 10))
+            result = subprocess.run(cmd,
+                                    shell=True,
+                                    text=True,
+                                    stdout=stdout,
+                                    stderr=stdout)
+
+            retry = self.retry
+            output_paths = task.get_output_paths()
+            while self._job_failed(result.returncode,
+                                   output_paths) and retry > 0:
+                retry -= 1
+                if random_sleep:
+                    time.sleep(random.randint(0, 10))
+                # Re-generate command to refresh ports.
+                cmd = get_cmd()
+                result = subprocess.run(cmd,
+                                        shell=True,
+                                        text=True,
+                                        stdout=stdout,
+                                        stderr=stdout)
+        finally:
+            # Clean up
+            os.remove(param_file)
         return task_name, result.returncode
 
     def _job_failed(self, return_code: int, output_paths: List[str]) -> bool:

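A reading aid for the nested templating above: shell_cmd keeps the literal '{task_cmd}' placeholder when it is embedded into the dlc command line, so the full command is only completed later by task.get_command(); that is also why get_cmd() can simply be called again inside the retry loop. A small illustration with made-up config values (not the real aliyun_cfg):

    # Illustrative values only; the real ones come from self.aliyun_cfg.
    aliyun_cfg = {'bashrc_path': '~/.bashrc', 'conda_env_name': 'opencompass'}
    pwd = '/workspace/opencompass'

    shell_cmd = (f'source {aliyun_cfg["bashrc_path"]}; '
                 f'conda activate {aliyun_cfg["conda_env_name"]}; '
                 f'cd {pwd}; '
                 '{task_cmd}')              # left unformatted on purpose

    tmpl = ('dlc create job'
            f" --command '{shell_cmd}'"
            ' --kind BatchJob')             # remaining flags omitted here

    # The template still contains the placeholder; task.get_command() fills it.
    print('{task_cmd}' in tmpl)  # True
    print(tmpl.format(task_cmd='python run_task.py tmp/1_params.py'))
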
@@ -62,15 +62,17 @@ class LocalRunner(BaseRunner):
                 # get cmd
                 mmengine.mkdir_or_exist('tmp/')
                 param_file = f'tmp/{os.getpid()}_params.py'
-                task.cfg.dump(param_file)
-                cmd = task.get_command(cfg_path=param_file,
-                                       template='{task_cmd}')
-                # run in subprocess if starts with torchrun etc.
-                if cmd.startswith('python'):
-                    task.run()
-                else:
-                    subprocess.run(cmd, shell=True, text=True)
-                os.remove(param_file)
+                try:
+                    task.cfg.dump(param_file)
+                    cmd = task.get_command(cfg_path=param_file,
+                                           template='{task_cmd}')
+                    # run in subprocess if starts with torchrun etc.
+                    if cmd.startswith('python'):
+                        task.run()
+                    else:
+                        subprocess.run(cmd, shell=True, text=True)
+                finally:
+                    os.remove(param_file)
                 status.append((task_name, 0))
         else:
             import torch

@@ -141,31 +143,34 @@ class LocalRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_{index}_params.py'
-        task.cfg.dump(param_file)
-
-        # Build up slurm command
-        tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
-        tmpl += ' {task_cmd}'
-        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
-        cmd = get_cmd()
-
-        logger = get_logger()
-        logger.debug(f'Running command: {cmd}')
-
-        # Run command
-        out_path = task.get_log_path(file_extension='out')
-        mmengine.mkdir_or_exist(osp.split(out_path)[0])
-        stdout = open(out_path, 'w', encoding='utf-8')
-
-        result = subprocess.run(cmd,
-                                shell=True,
-                                text=True,
-                                stdout=stdout,
-                                stderr=stdout)
-
-        if result.returncode != 0:
-            logger.warning(f'task {task_name} fail, see\n{out_path}')
-
-        # Clean up
-        os.remove(param_file)
+        try:
+            task.cfg.dump(param_file)
+
+            # Build up slurm command
+            tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
+            tmpl += ' {task_cmd}'
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
+            cmd = get_cmd()
+
+            logger = get_logger()
+            logger.debug(f'Running command: {cmd}')
+
+            # Run command
+            out_path = task.get_log_path(file_extension='out')
+            mmengine.mkdir_or_exist(osp.split(out_path)[0])
+            stdout = open(out_path, 'w', encoding='utf-8')
+
+            result = subprocess.run(cmd,
+                                    shell=True,
+                                    text=True,
+                                    stdout=stdout,
+                                    stderr=stdout)
+
+            if result.returncode != 0:
+                logger.warning(f'task {task_name} fail, see\n{out_path}')
+        finally:
+            # Clean up
+            os.remove(param_file)
         return task_name, result.returncode

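The template assembled in this hunk pins the subprocess to the GPUs the runner reserved for the task, again through the '{task_cmd}' placeholder. A short illustration with an assumed GPU assignment:

    gpu_ids = [2, 3]  # assumed assignment from the runner's GPU pool

    tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
    tmpl += ' {task_cmd}'

    # task.get_command() later substitutes the real launch command; a plain
    # str.format shows the shape of the result:
    print(tmpl.format(task_cmd='python run_task.py tmp/123_0_params.py'))
    # -> CUDA_VISIBLE_DEVICES=2,3 python run_task.py tmp/123_0_params.py
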
@@ -91,60 +91,64 @@ class SlurmRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_params.py'
-        task_cfg.dump(param_file)
-
-        # Build up slurm command
-        tmpl = 'srun'
-        if self.partition:
-            tmpl += f' -p {self.partition}'
-        if self.quotatype:
-            tmpl += f' --quotatype={self.quotatype}'
-        if self.qos:
-            tmpl += f' --qos={self.qos}'
-        if num_gpus > 0:
-            tmpl += f' --gres=gpu:{num_gpus}'
-        tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
-        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
-        cmd = get_cmd()
-
-        logger = get_logger()
-        logger.debug(f'Running command: {cmd}')
-
-        # Run command with retry
-        if self.debug:
-            stdout = None
-        else:
-            out_path = task.get_log_path(file_extension='out')
-            mmengine.mkdir_or_exist(osp.split(out_path)[0])
-            stdout = open(out_path, 'w', encoding='utf-8')
-
-        if random_sleep:
-            time.sleep(random.randint(0, 10))
-        result = subprocess.run(cmd,
-                                shell=True,
-                                text=True,
-                                stdout=stdout,
-                                stderr=stdout)
-
-        retry = self.retry
-        output_paths = task.get_output_paths()
-        while self._job_failed(result.returncode, output_paths) and retry > 0:
-            retry -= 1
-            if random_sleep:
-                time.sleep(random.randint(0, 10))
-            # Re-generate command to refresh ports.
-            cmd = get_cmd()
-            result = subprocess.run(cmd,
-                                    shell=True,
-                                    text=True,
-                                    stdout=stdout,
-                                    stderr=stdout)
-
-        if result.returncode != 0 and not self.debug:
-            logger.warning(f'task {task_name} fail, see\n{out_path}')
-
-        # Clean up
-        os.remove(param_file)
+        try:
+            task_cfg.dump(param_file)
+
+            # Build up slurm command
+            tmpl = 'srun'
+            if self.partition:
+                tmpl += f' -p {self.partition}'
+            if self.quotatype:
+                tmpl += f' --quotatype={self.quotatype}'
+            if self.qos:
+                tmpl += f' --qos={self.qos}'
+            if num_gpus > 0:
+                tmpl += f' --gres=gpu:{num_gpus}'
+            tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
+            cmd = get_cmd()
+
+            logger = get_logger()
+            logger.debug(f'Running command: {cmd}')
+
+            # Run command with retry
+            if self.debug:
+                stdout = None
+            else:
+                out_path = task.get_log_path(file_extension='out')
+                mmengine.mkdir_or_exist(osp.split(out_path)[0])
+                stdout = open(out_path, 'w', encoding='utf-8')
+
+            if random_sleep:
+                time.sleep(random.randint(0, 10))
+            result = subprocess.run(cmd,
+                                    shell=True,
+                                    text=True,
+                                    stdout=stdout,
+                                    stderr=stdout)
+
+            retry = self.retry
+            output_paths = task.get_output_paths()
+            while self._job_failed(result.returncode,
+                                   output_paths) and retry > 0:
+                retry -= 1
+                if random_sleep:
+                    time.sleep(random.randint(0, 10))
+                # Re-generate command to refresh ports.
+                cmd = get_cmd()
+                result = subprocess.run(cmd,
+                                        shell=True,
+                                        text=True,
+                                        stdout=stdout,
+                                        stderr=stdout)
+
+            if result.returncode != 0 and not self.debug:
+                logger.warning(f'task {task_name} fail, see\n{out_path}')
+        finally:
+            # Clean up
+            os.remove(param_file)
         return task_name, result.returncode
 
     def _job_failed(self, return_code: int, output_paths: List[str]) -> bool:

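Putting this hunk's pieces together: the srun template grows flag by flag from the runner's settings and ends with the '{task_cmd}' placeholder, and because get_cmd is a functools.partial over task.get_command, the retry loop can rebuild the full command (including any port the task picks) on every attempt. An illustration with assumed settings, not output taken from a real run:

    # Assumed runner settings, for illustration only.
    partition, quotatype, qos, num_gpus = 'gpu-part', 'auto', None, 2
    task_name = 'demo-task'

    tmpl = 'srun'
    if partition:
        tmpl += f' -p {partition}'
    if quotatype:
        tmpl += f' --quotatype={quotatype}'
    if qos:
        tmpl += f' --qos={qos}'
    if num_gpus > 0:
        tmpl += f' --gres=gpu:{num_gpus}'
    tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'

    print(tmpl.format(task_cmd='python run_task.py tmp/123_params.py'))
    # -> srun -p gpu-part --quotatype=auto --gres=gpu:2 -N1 -J 'demo-task'
    #    python run_task.py tmp/123_params.py
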
@@ -3,7 +3,9 @@ from typing import List, Union
 import tabulate
 from mmengine.config import Config
 
+from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
 from opencompass.utils import get_logger, match_files
 
 

@@ -118,54 +120,61 @@ def exec_mm_infer_runner(tasks, args, cfg):
     runner(tasks)
 
 
-def exec_infer_runner(tasks, args, cfg):
-    """execute infer runner according to args."""
-    if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLInferTask'),
-                             max_num_workers=args.max_num_workers,
-                             partition=args.partition,
-                             quotatype=args.quotatype,
-                             qos=args.qos,
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLInferTask'),
-                           max_num_workers=args.max_num_workers,
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
-                           retry=args.retry,
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
-    else:
-        runner = LocalRunner(task=dict(type='OpenICLInferTask'),
-                             max_num_workers=args.max_num_workers,
-                             max_workers_per_gpu=args.max_workers_per_gpu,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    runner(tasks)
-
-
-def exec_eval_runner(tasks, args, cfg):
-    """execute infer runner according to args."""
-    if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLEvalTask'),
-                             max_num_workers=args.max_num_workers,
-                             partition=args.partition,
-                             quotatype=args.quotatype,
-                             qos=args.qos,
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLEvalTask'),
-                           max_num_workers=args.max_num_workers,
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
-                           retry=args.retry,
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
-    else:
-        runner = LocalRunner(task=dict(type='OpenICLEvalTask'),
-                             max_num_workers=args.max_num_workers,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    runner(tasks)
+def get_config_type(obj) -> str:
+    return f'{obj.__module__}.{obj.__name__}'
+
+
+def fill_infer_cfg(cfg, args):
+    new_cfg = dict(infer=dict(
+        partitioner=dict(type=get_config_type(SizePartitioner),
+                         max_task_size=args.max_partition_size,
+                         gen_task_coef=args.gen_task_coef),
+        runner=dict(
+            max_num_workers=args.max_num_workers,
+            debug=args.debug,
+            task=dict(type=get_config_type(OpenICLInferTask)),
+            lark_bot_url=cfg['lark_bot_url'],
+        )), )
+    if args.slurm:
+        new_cfg['infer']['runner']['type'] = get_config_type(SlurmRunner)
+        new_cfg['infer']['runner']['partition'] = args.partition
+        new_cfg['infer']['runner']['quotatype'] = args.quotatype
+        new_cfg['infer']['runner']['qos'] = args.qos
+        new_cfg['infer']['runner']['retry'] = args.retry
+    elif args.dlc:
+        new_cfg['infer']['runner']['type'] = get_config_type(DLCRunner)
+        new_cfg['infer']['runner']['aliyun_cfg'] = Config.fromfile(
+            args.aliyun_cfg)
+        new_cfg['infer']['runner']['retry'] = args.retry
+    else:
+        new_cfg['infer']['runner']['type'] = get_config_type(LocalRunner)
+        new_cfg['infer']['runner'][
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
+    cfg.merge_from_dict(new_cfg)
+
+
+def fill_eval_cfg(cfg, args):
+    new_cfg = dict(
+        eval=dict(partitioner=dict(type=get_config_type(NaivePartitioner)),
+                  runner=dict(
+                      max_num_workers=args.max_num_workers,
+                      debug=args.debug,
+                      task=dict(type=get_config_type(OpenICLEvalTask)),
+                      lark_bot_url=cfg['lark_bot_url'],
+                  )))
+    if args.slurm:
+        new_cfg['eval']['runner']['type'] = get_config_type(SlurmRunner)
+        new_cfg['eval']['runner']['partition'] = args.partition
+        new_cfg['eval']['runner']['quotatype'] = args.quotatype
+        new_cfg['eval']['runner']['qos'] = args.qos
+        new_cfg['eval']['runner']['retry'] = args.retry
+    elif args.dlc:
+        new_cfg['eval']['runner']['type'] = get_config_type(DLCRunner)
+        new_cfg['eval']['runner']['aliyun_cfg'] = Config.fromfile(
+            args.aliyun_cfg)
+        new_cfg['eval']['runner']['retry'] = args.retry
+    else:
+        new_cfg['eval']['runner']['type'] = get_config_type(LocalRunner)
+        new_cfg['eval']['runner'][
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
+    cfg.merge_from_dict(new_cfg)

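The refactor above replaces direct runner construction with config filling: get_config_type turns a class into the dotted-path string that registry-based configs expect, and fill_infer_cfg / fill_eval_cfg build a plain nested dict from the CLI args and merge it into the loaded Config. A hedged sketch of both halves (the printed dotted path depends on where the class is actually defined and is not verified here):

    from mmengine.config import Config

    from opencompass.runners import SlurmRunner


    def get_config_type(obj) -> str:
        # Same helper as in the hunk: class -> 'module.path.ClassName'.
        return f'{obj.__module__}.{obj.__name__}'


    print(get_config_type(SlurmRunner))
    # e.g. 'opencompass.runners.slurm.SlurmRunner'

    # merge_from_dict folds a nested plain dict into an existing Config, which
    # is how the filled runner/partitioner settings end up under cfg.infer and
    # cfg.eval.
    cfg = Config(dict(lark_bot_url=None))
    cfg.merge_from_dict(dict(infer=dict(runner=dict(max_num_workers=8))))
    print(cfg.infer.runner.max_num_workers)  # 8
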
run.py (112 lines changed)

@@ -6,13 +6,12 @@ from datetime import datetime
 
 from mmengine.config import Config, DictAction
 
-from opencompass.partitioners import (MultimodalNaivePartitioner,
-                                      NaivePartitioner, SizePartitioner)
+from opencompass.partitioners import MultimodalNaivePartitioner
 from opencompass.registry import PARTITIONERS, RUNNERS
 from opencompass.runners import SlurmRunner
 from opencompass.utils import LarkReporter, Summarizer, get_logger
-from opencompass.utils.run import (exec_eval_runner, exec_infer_runner,
-                                   exec_mm_infer_runner, get_config_from_arg)
+from opencompass.utils.run import (exec_mm_infer_runner, fill_eval_cfg,
+                                   fill_infer_cfg, get_config_from_arg)
 
 
 def parse_args():

@@ -245,39 +244,29 @@ def main():
             tasks = partitioner(cfg)
             exec_mm_infer_runner(tasks, args, cfg)
             return
-        elif args.dlc or args.slurm or cfg.get('infer', None) is None:
-            # Use SizePartitioner to split into subtasks
-            partitioner = SizePartitioner(
-                osp.join(cfg['work_dir'], 'predictions/'),
-                max_task_size=args.max_partition_size,
-                gen_task_coef=args.gen_task_coef)
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            # execute the infer subtasks
-            exec_infer_runner(tasks, args, cfg)
-        # If they have specified "infer" in config and haven't used --slurm
-        # or --dlc, just follow the config
-        else:
-            if args.partition is not None:
-                if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
-                    cfg.infer.runner.partition = args.partition
-                    cfg.infer.runner.quotatype = args.quotatype
-                else:
-                    logger.warning('SlurmRunner is not used, so the partition '
-                                   'argument is ignored.')
-            if args.debug:
-                cfg.infer.runner.debug = True
-            if args.lark:
-                cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
-            cfg.infer.partitioner['out_dir'] = osp.join(
-                cfg['work_dir'], 'predictions/')
-            partitioner = PARTITIONERS.build(cfg.infer.partitioner)
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            runner = RUNNERS.build(cfg.infer.runner)
-            runner(tasks)
+
+        if args.dlc or args.slurm or cfg.get('infer', None) is None:
+            fill_infer_cfg(cfg, args)
+
+        if args.partition is not None:
+            if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
+                cfg.infer.runner.partition = args.partition
+                cfg.infer.runner.quotatype = args.quotatype
+            else:
+                logger.warning('SlurmRunner is not used, so the partition '
+                               'argument is ignored.')
+        if args.debug:
+            cfg.infer.runner.debug = True
+        if args.lark:
+            cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
+        cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
+                                                    'predictions/')
+        partitioner = PARTITIONERS.build(cfg.infer.partitioner)
+        tasks = partitioner(cfg)
+        if args.dry_run:
+            return
+        runner = RUNNERS.build(cfg.infer.runner)
+        runner(tasks)
 
     # evaluate
     if args.mode in ['all', 'eval']:

@@ -289,37 +278,28 @@ def main():
                            'also specified --slurm or --dlc. '
                            'The "eval" configuration will be overridden by '
                            'your runtime arguments.')
 
         if args.dlc or args.slurm or cfg.get('eval', None) is None:
-            # Use NaivePartitioner,not split
-            partitioner = NaivePartitioner(
-                osp.join(cfg['work_dir'], 'results/'))
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            # execute the eval tasks
-            exec_eval_runner(tasks, args, cfg)
-        # If they have specified "eval" in config and haven't used --slurm
-        # or --dlc, just follow the config
-        else:
-            if args.partition is not None:
-                if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
-                    cfg.eval.runner.partition = args.partition
-                    cfg.eval.runner.quotatype = args.quotatype
-                else:
-                    logger.warning('SlurmRunner is not used, so the partition '
-                                   'argument is ignored.')
-            if args.debug:
-                cfg.eval.runner.debug = True
-            if args.lark:
-                cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
-            cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'],
-                                                       'results/')
-            partitioner = PARTITIONERS.build(cfg.eval.partitioner)
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            runner = RUNNERS.build(cfg.eval.runner)
-            runner(tasks)
+            fill_eval_cfg(cfg, args)
+
+        if args.partition is not None:
+            if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
+                cfg.eval.runner.partition = args.partition
+                cfg.eval.runner.quotatype = args.quotatype
+            else:
+                logger.warning('SlurmRunner is not used, so the partition '
+                               'argument is ignored.')
+        if args.debug:
+            cfg.eval.runner.debug = True
+        if args.lark:
+            cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
+        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
+        partitioner = PARTITIONERS.build(cfg.eval.partitioner)
+        tasks = partitioner(cfg)
+        if args.dry_run:
+            return
+        runner = RUNNERS.build(cfg.eval.runner)
+        runner(tasks)
 
     # visualize
     if args.mode in ['all', 'eval', 'viz']:

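After this change main() never instantiates a runner directly: the CLI flags only fill cfg.infer / cfg.eval (via fill_infer_cfg / fill_eval_cfg or the user's own config), and both stages converge on the registries, where PARTITIONERS.build and RUNNERS.build construct whatever the config names. A condensed sketch of that shared tail, mirroring the lines above rather than adding behaviour (run_stage and its signature are invented for illustration):

    import os.path as osp

    from opencompass.registry import PARTITIONERS, RUNNERS


    def run_stage(cfg, args, stage, out_subdir):
        """stage is 'infer' or 'eval'; cfg[stage] is assumed filled already."""
        stage_cfg = cfg[stage]
        stage_cfg.partitioner['out_dir'] = osp.join(cfg['work_dir'], out_subdir)
        partitioner = PARTITIONERS.build(stage_cfg.partitioner)
        tasks = partitioner(cfg)
        if args.dry_run:
            return
        runner = RUNNERS.build(stage_cfg.runner)
        runner(tasks)
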