[Sync] Use finally to clean up temp files (#337)

Tong Gao, 2023-09-04 15:20:16 +08:00, committed by GitHub
parent 2cd994c3d1
commit ce65d3393b
5 changed files with 233 additions and 230 deletions
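
The change is mechanical across the DLC, Local, and Slurm runners: each launch method dumps the task config to tmp/, and everything from that dump through the subprocess call now sits in a try block whose finally removes the temp file even when the task raises or returns early. A condensed sketch of the resulting shape (hypothetical launch signature and task.py command; the real methods also build runner-specific commands and retry on failure):

import os
import subprocess

import mmengine
from mmengine.config import Config


def launch(task_cfg: Config, cmd_template: str) -> int:
    mmengine.mkdir_or_exist('tmp/')
    param_file = f'tmp/{os.getpid()}_params.py'
    try:
        task_cfg.dump(param_file)
        cmd = cmd_template.format(task_cmd=f'python task.py {param_file}')
        result = subprocess.run(cmd, shell=True, text=True)
    finally:
        # Clean up: reached on success, on exception, and on early return
        os.remove(param_file)
    return result.returncode


print(launch(Config(dict(lr=0.1)), 'echo {task_cmd}'))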

@@ -86,11 +86,13 @@ class DLCRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_params.py'
+        try:
             task_cfg.dump(param_file)
 
             # Build up DLC command
             pwd = os.getcwd()
-            shell_cmd = (f'source {self.aliyun_cfg["bashrc_path"]}; '
+            shell_cmd = (
+                f'source {self.aliyun_cfg["bashrc_path"]}; '
                 f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
                 f'cd {pwd}; '
                 '{task_cmd}')
@@ -107,7 +109,9 @@ class DLCRunner(BaseRunner):
                     f' --worker_memory {max(num_gpus * 32, 48)}'
                     f" --worker_image {self.aliyun_cfg['worker_image']}"
                     ' --interactive')
-            get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()
 
             logger = get_logger()
@@ -131,7 +135,8 @@ class DLCRunner(BaseRunner):
             retry = self.retry
             output_paths = task.get_output_paths()
 
-            while self._job_failed(result.returncode, output_paths) and retry > 0:
+            while self._job_failed(result.returncode,
+                                   output_paths) and retry > 0:
                 retry -= 1
                 if random_sleep:
                     time.sleep(random.randint(0, 10))
@@ -142,7 +147,7 @@ class DLCRunner(BaseRunner):
                     text=True,
                     stdout=stdout,
                     stderr=stdout)
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode

@@ -62,6 +62,7 @@ class LocalRunner(BaseRunner):
                 # get cmd
                 mmengine.mkdir_or_exist('tmp/')
                 param_file = f'tmp/{os.getpid()}_params.py'
+                try:
                     task.cfg.dump(param_file)
                     cmd = task.get_command(cfg_path=param_file,
                                            template='{task_cmd}')
@@ -70,6 +71,7 @@ class LocalRunner(BaseRunner):
                         task.run()
                     else:
                         subprocess.run(cmd, shell=True, text=True)
+                finally:
                     os.remove(param_file)
                 status.append((task_name, 0))
         else:
@@ -141,12 +143,15 @@ class LocalRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_{index}_params.py'
+        try:
             task.cfg.dump(param_file)
 
             # Build up slurm command
             tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
             tmpl += ' {task_cmd}'
-            get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()
 
             logger = get_logger()
@@ -165,7 +170,7 @@ class LocalRunner(BaseRunner):
 
             if result.returncode != 0:
                 logger.warning(f'task {task_name} fail, see\n{out_path}')
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode

@@ -91,6 +91,7 @@ class SlurmRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_params.py'
+        try:
             task_cfg.dump(param_file)
 
             # Build up slurm command
@@ -104,7 +105,9 @@ class SlurmRunner(BaseRunner):
             if num_gpus > 0:
                 tmpl += f' --gres=gpu:{num_gpus}'
             tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
-            get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()
 
             logger = get_logger()
@@ -128,7 +131,8 @@ class SlurmRunner(BaseRunner):
             retry = self.retry
             output_paths = task.get_output_paths()
 
-            while self._job_failed(result.returncode, output_paths) and retry > 0:
+            while self._job_failed(result.returncode,
+                                   output_paths) and retry > 0:
                 retry -= 1
                 if random_sleep:
                     time.sleep(random.randint(0, 10))
@@ -142,7 +146,7 @@ class SlurmRunner(BaseRunner):
 
             if result.returncode != 0 and not self.debug:
                 logger.warning(f'task {task_name} fail, see\n{out_path}')
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode

@@ -3,7 +3,9 @@ from typing import List, Union
 
 import tabulate
 from mmengine.config import Config
 
+from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
 from opencompass.utils import get_logger, match_files
@@ -118,54 +120,61 @@ def exec_mm_infer_runner(tasks, args, cfg):
     runner(tasks)
 
 
-def exec_infer_runner(tasks, args, cfg):
-    """execute infer runner according to args."""
-    if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLInferTask'),
-                             max_num_workers=args.max_num_workers,
-                             partition=args.partition,
-                             quotatype=args.quotatype,
-                             qos=args.qos,
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLInferTask'),
-                           max_num_workers=args.max_num_workers,
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
-                           retry=args.retry,
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
-    else:
-        runner = LocalRunner(task=dict(type='OpenICLInferTask'),
-                             max_num_workers=args.max_num_workers,
-                             max_workers_per_gpu=args.max_workers_per_gpu,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    runner(tasks)
-
-
-def exec_eval_runner(tasks, args, cfg):
-    """execute infer runner according to args."""
-    if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLEvalTask'),
-                             max_num_workers=args.max_num_workers,
-                             partition=args.partition,
-                             quotatype=args.quotatype,
-                             qos=args.qos,
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLEvalTask'),
-                           max_num_workers=args.max_num_workers,
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
-                           retry=args.retry,
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
-    else:
-        runner = LocalRunner(task=dict(type='OpenICLEvalTask'),
-                             max_num_workers=args.max_num_workers,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    runner(tasks)
+def get_config_type(obj) -> str:
+    return f'{obj.__module__}.{obj.__name__}'
+
+
+def fill_infer_cfg(cfg, args):
+    new_cfg = dict(infer=dict(
+        partitioner=dict(type=get_config_type(SizePartitioner),
+                         max_task_size=args.max_partition_size,
+                         gen_task_coef=args.gen_task_coef),
+        runner=dict(
+            max_num_workers=args.max_num_workers,
+            debug=args.debug,
+            task=dict(type=get_config_type(OpenICLInferTask)),
+            lark_bot_url=cfg['lark_bot_url'],
+        )), )
+    if args.slurm:
+        new_cfg['infer']['runner']['type'] = get_config_type(SlurmRunner)
+        new_cfg['infer']['runner']['partition'] = args.partition
+        new_cfg['infer']['runner']['quotatype'] = args.quotatype
+        new_cfg['infer']['runner']['qos'] = args.qos
+        new_cfg['infer']['runner']['retry'] = args.retry
+    elif args.dlc:
+        new_cfg['infer']['runner']['type'] = get_config_type(DLCRunner)
+        new_cfg['infer']['runner']['aliyun_cfg'] = Config.fromfile(
+            args.aliyun_cfg)
+        new_cfg['infer']['runner']['retry'] = args.retry
+    else:
+        new_cfg['infer']['runner']['type'] = get_config_type(LocalRunner)
+        new_cfg['infer']['runner'][
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
+    cfg.merge_from_dict(new_cfg)
+
+
+def fill_eval_cfg(cfg, args):
+    new_cfg = dict(
+        eval=dict(partitioner=dict(type=get_config_type(NaivePartitioner)),
+                  runner=dict(
+                      max_num_workers=args.max_num_workers,
+                      debug=args.debug,
+                      task=dict(type=get_config_type(OpenICLEvalTask)),
+                      lark_bot_url=cfg['lark_bot_url'],
+                  )))
+    if args.slurm:
+        new_cfg['eval']['runner']['type'] = get_config_type(SlurmRunner)
+        new_cfg['eval']['runner']['partition'] = args.partition
+        new_cfg['eval']['runner']['quotatype'] = args.quotatype
+        new_cfg['eval']['runner']['qos'] = args.qos
+        new_cfg['eval']['runner']['retry'] = args.retry
+    elif args.dlc:
+        new_cfg['eval']['runner']['type'] = get_config_type(DLCRunner)
+        new_cfg['eval']['runner']['aliyun_cfg'] = Config.fromfile(
+            args.aliyun_cfg)
+        new_cfg['eval']['runner']['retry'] = args.retry
+    else:
+        new_cfg['eval']['runner']['type'] = get_config_type(LocalRunner)
+        new_cfg['eval']['runner'][
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
+    cfg.merge_from_dict(new_cfg)
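
get_config_type turns a class into the dotted import path that mmengine's registry resolves later, and merge_from_dict layers the CLI-derived dict onto the loaded config. A toy illustration of both (the nested keys mirror the structure built above; the values are made up):

from collections import OrderedDict

from mmengine.config import Config


def get_config_type(obj) -> str:
    return f'{obj.__module__}.{obj.__name__}'


print(get_config_type(OrderedDict))  # collections.OrderedDict

cfg = Config(dict(infer=dict(runner=dict(debug=False))))
cfg.merge_from_dict(dict(infer=dict(runner=dict(debug=True, retry=2))))
print(cfg.infer.runner.debug, cfg.infer.runner.retry)  # True 2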

run.py

@@ -6,13 +6,12 @@ from datetime import datetime
 
 from mmengine.config import Config, DictAction
 
-from opencompass.partitioners import (MultimodalNaivePartitioner,
-                                      NaivePartitioner, SizePartitioner)
+from opencompass.partitioners import MultimodalNaivePartitioner
 from opencompass.registry import PARTITIONERS, RUNNERS
 from opencompass.runners import SlurmRunner
 from opencompass.utils import LarkReporter, Summarizer, get_logger
-from opencompass.utils.run import (exec_eval_runner, exec_infer_runner,
-                                   exec_mm_infer_runner, get_config_from_arg)
+from opencompass.utils.run import (exec_mm_infer_runner, fill_eval_cfg,
+                                   fill_infer_cfg, get_config_from_arg)
 
 
 def parse_args():
@@ -245,20 +244,10 @@ def main():
             tasks = partitioner(cfg)
             exec_mm_infer_runner(tasks, args, cfg)
             return
-        elif args.dlc or args.slurm or cfg.get('infer', None) is None:
-            # Use SizePartitioner to split into subtasks
-            partitioner = SizePartitioner(
-                osp.join(cfg['work_dir'], 'predictions/'),
-                max_task_size=args.max_partition_size,
-                gen_task_coef=args.gen_task_coef)
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            # execute the infer subtasks
-            exec_infer_runner(tasks, args, cfg)
-        # If they have specified "infer" in config and haven't used --slurm
-        # or --dlc, just follow the config
-        else:
+
+        if args.dlc or args.slurm or cfg.get('infer', None) is None:
+            fill_infer_cfg(cfg, args)
+
         if args.partition is not None:
             if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                 cfg.infer.runner.partition = args.partition
@@ -270,8 +259,8 @@ def main():
             cfg.infer.runner.debug = True
         if args.lark:
             cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
-        cfg.infer.partitioner['out_dir'] = osp.join(
-            cfg['work_dir'], 'predictions/')
+        cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
+                                                    'predictions/')
         partitioner = PARTITIONERS.build(cfg.infer.partitioner)
         tasks = partitioner(cfg)
         if args.dry_run:
@@ -289,18 +278,10 @@ def main():
                 'also specified --slurm or --dlc. '
                 'The "eval" configuration will be overridden by '
                 'your runtime arguments.')
-        if args.dlc or args.slurm or cfg.get('eval', None) is None:
-            # Use NaivePartitioner, not split
-            partitioner = NaivePartitioner(
-                osp.join(cfg['work_dir'], 'results/'))
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            # execute the eval tasks
-            exec_eval_runner(tasks, args, cfg)
-        # If they have specified "eval" in config and haven't used --slurm
-        # or --dlc, just follow the config
-        else:
+
+        if args.dlc or args.slurm or cfg.get('eval', None) is None:
+            fill_eval_cfg(cfg, args)
+
         if args.partition is not None:
             if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                 cfg.eval.runner.partition = args.partition
@@ -312,8 +293,7 @@ def main():
             cfg.eval.runner.debug = True
         if args.lark:
             cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
-        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'],
-                                                   'results/')
+        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
         partitioner = PARTITIONERS.build(cfg.eval.partitioner)
         tasks = partitioner(cfg)
         if args.dry_run: