[Update] Fix *_params.py temp-file name conflicts; add keep_tmp_file flag to support keeping the temporary config files. (#1640)

2025-05-30 16:03:24 +08:00 · 2024-10-25 16:39:25 +08:00 · 2024-10-25 16:39:25 +08:00 · 84be90669b
commit 84be90669b
parent 2542bc6907
4 changed files with 52 additions and 19 deletions
--- a/opencompass/runners/dlc.py
+++ b/opencompass/runners/dlc.py
@ -42,14 +42,15 @@ class DLCRunner(BaseRunner):
                 eval_with_gpu: list = ['plugin_eval'],
                 retry: int = 2,
                 debug: bool = False,
-                 lark_bot_url: str = None):
+                 lark_bot_url: str = None,
+                 keep_tmp_file: bool = False):
        super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
        self.aliyun_cfg = aliyun_cfg
        self.max_num_workers = max_num_workers
        self.retry = retry

        self.eval_with_gpu = eval_with_gpu
-
+        self.keep_tmp_file = keep_tmp_file
        logger = get_logger()
        logger.warning(
            'To ensure the integrity of the log results, the log displayed '
@ -106,7 +107,10 @@ class DLCRunner(BaseRunner):

        # Dump task config to file
        mmengine.mkdir_or_exist('tmp/')
-        param_file = f'tmp/{os.getpid()}_params.py'
+        # Use a UUID instead of the PID to avoid filename conflicts
+        import uuid
+        uuid_str = str(uuid.uuid4())
+        param_file = f'tmp/{uuid_str}_params.py'
        pwd = os.getcwd()
        try:
            cfg.dump(param_file)
@ -305,7 +309,10 @@ class DLCRunner(BaseRunner):
                return_code = _run_within_retry()
        finally:
            # Clean up
-            os.remove(param_file)
+            if not self.keep_tmp_file:
+                os.remove(param_file)
+            else:
+                pass

        return task_name, return_code

--- a/opencompass/runners/local.py
+++ b/opencompass/runners/local.py
@ -56,10 +56,12 @@ class LocalRunner(BaseRunner):
                 debug: bool = False,
                 max_workers_per_gpu: int = 1,
                 lark_bot_url: str = None,
+                 keep_tmp_file: bool = False,
                 **kwargs):
        super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
        self.max_num_workers = max_num_workers
        self.max_workers_per_gpu = max_workers_per_gpu
+        self.keep_tmp_file = keep_tmp_file
        logger = get_logger()
        for k, v in kwargs.items():
            logger.warning(f'Ignored argument in {self.__module__}: {k}={v}')
@ -100,7 +102,10 @@ class LocalRunner(BaseRunner):
                assert len(all_gpu_ids) >= num_gpus
                # get cmd
                mmengine.mkdir_or_exist('tmp/')
-                param_file = f'tmp/{os.getpid()}_params.py'
+                import uuid
+                uuid_str = str(uuid.uuid4())
+
+                param_file = f'tmp/{uuid_str}_params.py'
                try:
                    task.cfg.dump(param_file)
                    # if use torchrun, restrict it behaves the same as non
@ -140,7 +145,10 @@ class LocalRunner(BaseRunner):
                                           stdout=log_file,
                                           stderr=subprocess.STDOUT)
                finally:
-                    os.remove(param_file)
+                    if not self.keep_tmp_file:
+                        os.remove(param_file)
+                    else:
+                        pass
                status.append((task_name, 0))
        else:
            if len(all_gpu_ids) > 0:
--- a/opencompass/runners/slurm_sequential.py
+++ b/opencompass/runners/slurm_sequential.py
@ -24,11 +24,11 @@ class SlurmSequentialRunner(BaseRunner):
    using `srun` command.

    This runner launches tasks one by one for execution. A new task will only
-    be launched when and only when max_num_workers is not met, and the previous
-    task has been successfully allocated to a machine. Therefore, unlike the
-    `SlurmRunner`, at most only one task will be in the PENDING status at the
-    same time during a run, making the random_sleep strategy no longer
-    necessary. In addition, this runner also includes a feature to
+    be launched when and only when max_num_workers is not met, and the
+    previous task has been successfully allocated to a machine. Therefore,
+    unlike the `SlurmRunner`, at most only one task will be in the PENDING
+    status at the same time during a run, making the random_sleep strategy
+    no longer necessary. In addition, this runner also includes a feature to
    automatically kill all jobs by the job_id on exit.

    The runner will obtain the job_id by reading the srun output similar to
@ -59,7 +59,8 @@ class SlurmSequentialRunner(BaseRunner):
                 qos: str = None,
                 debug: bool = False,
                 lark_bot_url: str = None,
-                 extra_command: Optional[List[str]] = None):
+                 extra_command: Optional[List[str]] = None,
+                 keep_tmp_file: bool = False):
        super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
        self.max_num_workers = max_num_workers
        self.retry = retry
@ -67,6 +68,7 @@ class SlurmSequentialRunner(BaseRunner):
        self.quotatype = quotatype
        self.qos = qos
        self.task_prefix = task_prefix
+        self.keep_tmp_file = keep_tmp_file
        if not extra_command:
            extra_command = []
        assert isinstance(extra_command, list)
@ -171,7 +173,10 @@ class SlurmSequentialRunner(BaseRunner):

        # Dump task config to file
        mmengine.mkdir_or_exist('tmp/')
-        param_file = f'tmp/{os.getpid()}_params.py'
+        # Use a UUID instead of the PID to avoid filename conflicts
+        import uuid
+        uuid_str = str(uuid.uuid4())
+        param_file = f'tmp/{uuid_str}_params.py'
        process = None
        try:
            cfg.dump(param_file)
@ -256,7 +261,11 @@ class SlurmSequentialRunner(BaseRunner):
                child_conn.close()
            if process is not None:
                process.kill()
-            os.remove(param_file)
+            if not self.keep_tmp_file:
+                os.remove(param_file)
+            else:
+                pass
+
        return task_name, process.returncode

    def _job_failed(self, return_code: int, output_paths: List[str]) -> bool:
--- a/opencompass/runners/volc.py
+++ b/opencompass/runners/volc.py
@ -47,7 +47,8 @@ class VOLCRunner(BaseRunner):
                 max_num_workers: int = 32,
                 retry: int = 2,
                 debug: bool = False,
-                 lark_bot_url: str = None):
+                 lark_bot_url: str = None,
+                 keep_tmp_file: bool = False):
        super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
        self.volcano_cfg = volcano_cfg
        self.max_num_workers = max_num_workers
@ -55,6 +56,7 @@ class VOLCRunner(BaseRunner):
        self.queue_name = queue_name
        self.preemptible = preemptible
        self.priority = priority
+        self.keep_tmp_file = keep_tmp_file

    def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
        """Launch multiple tasks.
@ -100,9 +102,12 @@ class VOLCRunner(BaseRunner):
        pwd = os.getcwd()
        # Dump task config to file
        mmengine.mkdir_or_exist('tmp/')
-        param_file = f'{pwd}/tmp/{os.getpid()}_params.py'
+        # Use a UUID instead of the PID to avoid filename conflicts
+        import uuid
+        uuid_str = str(uuid.uuid4())
+        param_file = f'{pwd}/tmp/{uuid_str}_params.py'

-        volc_cfg_file = f'{pwd}/tmp/{os.getpid()}_cfg.yaml'
+        volc_cfg_file = f'{pwd}/tmp/{uuid_str}_cfg.yaml'
        volc_cfg = self._choose_flavor(num_gpus)
        with open(volc_cfg_file, 'w') as fp:
            yaml.dump(volc_cfg, fp, sort_keys=False)
@ -191,8 +196,12 @@ class VOLCRunner(BaseRunner):

        finally:
            # Clean up
-            os.remove(param_file)
-            os.remove(volc_cfg_file)
+            if not self.keep_tmp_file:
+                os.remove(param_file)
+                os.remove(volc_cfg_file)
+            else:
+                pass
+
        return task_name, returncode

    def _run_task(self, cmd, log_path, poll_interval):