diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 9064d5b1..094c4269 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -42,14 +42,15 @@ class DLCRunner(BaseRunner): eval_with_gpu: list = ['plugin_eval'], retry: int = 2, debug: bool = False, - lark_bot_url: str = None): + lark_bot_url: str = None, + keep_tmp_file: bool = False): super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) self.aliyun_cfg = aliyun_cfg self.max_num_workers = max_num_workers self.retry = retry self.eval_with_gpu = eval_with_gpu - + self.keep_tmp_file = keep_tmp_file logger = get_logger() logger.warning( 'To ensure the integrity of the log results, the log displayed ' @@ -106,7 +107,10 @@ class DLCRunner(BaseRunner): # Dump task config to file mmengine.mkdir_or_exist('tmp/') - param_file = f'tmp/{os.getpid()}_params.py' + # Using uuid to avoid filename conflict + import uuid + uuid_str = str(uuid.uuid4()) + param_file = f'tmp/{uuid_str}_params.py' pwd = os.getcwd() try: cfg.dump(param_file) @@ -305,7 +309,10 @@ class DLCRunner(BaseRunner): return_code = _run_within_retry() finally: # Clean up - os.remove(param_file) + if not self.keep_tmp_file: + os.remove(param_file) + else: + pass return task_name, return_code diff --git a/opencompass/runners/local.py b/opencompass/runners/local.py index 3ec1c627..8306e89e 100644 --- a/opencompass/runners/local.py +++ b/opencompass/runners/local.py @@ -56,10 +56,12 @@ class LocalRunner(BaseRunner): debug: bool = False, max_workers_per_gpu: int = 1, lark_bot_url: str = None, + keep_tmp_file: bool = False, **kwargs): super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) self.max_num_workers = max_num_workers self.max_workers_per_gpu = max_workers_per_gpu + self.keep_tmp_file = keep_tmp_file logger = get_logger() for k, v in kwargs.items(): logger.warning(f'Ignored argument in {self.__module__}: {k}={v}') @@ -100,7 +102,10 @@ class LocalRunner(BaseRunner): assert len(all_gpu_ids) >= num_gpus # get cmd mmengine.mkdir_or_exist('tmp/') - param_file = f'tmp/{os.getpid()}_params.py' + import uuid + uuid_str = str(uuid.uuid4()) + + param_file = f'tmp/{uuid_str}_params.py' try: task.cfg.dump(param_file) # if use torchrun, restrict it behaves the same as non @@ -140,7 +145,10 @@ class LocalRunner(BaseRunner): stdout=log_file, stderr=subprocess.STDOUT) finally: - os.remove(param_file) + if not self.keep_tmp_file: + os.remove(param_file) + else: + pass status.append((task_name, 0)) else: if len(all_gpu_ids) > 0: diff --git a/opencompass/runners/slurm_sequential.py b/opencompass/runners/slurm_sequential.py index 3b4dcad5..5dee149c 100644 --- a/opencompass/runners/slurm_sequential.py +++ b/opencompass/runners/slurm_sequential.py @@ -24,11 +24,11 @@ class SlurmSequentialRunner(BaseRunner): using `srun` command. This runner launches tasks one by one for execution. A new task will only - be launched when and only when max_num_workers is not met, and the previous - task has been successfully allocated to a machine. Therefore, unlike the - `SlurmRunner`, at most only one task will be in the PENDING status at the - same time during a run, making the random_sleep strategy no longer - necessary. In addition, this runner also includes a feature to + be launched when and only when max_num_workers is not met, and the + previous task has been successfully allocated to a machine. Therefore, + unlike the `SlurmRunner`, at most only one task will be in the PENDING + status at the same time during a run, making the random_sleep strategy + no longer necessary. In addition, this runner also includes a feature to automatically kill all jobs by the job_id on exit. The runner will obtain the job_id by reading the srun output similar to @@ -59,7 +59,8 @@ class SlurmSequentialRunner(BaseRunner): qos: str = None, debug: bool = False, lark_bot_url: str = None, - extra_command: Optional[List[str]] = None): + extra_command: Optional[List[str]] = None, + keep_tmp_file: bool = False): super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) self.max_num_workers = max_num_workers self.retry = retry @@ -67,6 +68,7 @@ class SlurmSequentialRunner(BaseRunner): self.quotatype = quotatype self.qos = qos self.task_prefix = task_prefix + self.keep_tmp_file = keep_tmp_file if not extra_command: extra_command = [] assert isinstance(extra_command, list) @@ -171,7 +173,10 @@ class SlurmSequentialRunner(BaseRunner): # Dump task config to file mmengine.mkdir_or_exist('tmp/') - param_file = f'tmp/{os.getpid()}_params.py' + # Using uuid to avoid filename conflict + import uuid + uuid_str = str(uuid.uuid4()) + param_file = f'tmp/{uuid_str}_params.py' process = None try: cfg.dump(param_file) @@ -256,7 +261,11 @@ class SlurmSequentialRunner(BaseRunner): child_conn.close() if process is not None: process.kill() - os.remove(param_file) + if not self.keep_tmp_file: + os.remove(param_file) + else: + pass + return task_name, process.returncode def _job_failed(self, return_code: int, output_paths: List[str]) -> bool: diff --git a/opencompass/runners/volc.py b/opencompass/runners/volc.py index 9a94ea67..f076daa6 100644 --- a/opencompass/runners/volc.py +++ b/opencompass/runners/volc.py @@ -47,7 +47,8 @@ class VOLCRunner(BaseRunner): max_num_workers: int = 32, retry: int = 2, debug: bool = False, - lark_bot_url: str = None): + lark_bot_url: str = None, + keep_tmp_file: bool = False): super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) self.volcano_cfg = volcano_cfg self.max_num_workers = max_num_workers @@ -55,6 +56,7 @@ class VOLCRunner(BaseRunner): self.queue_name = queue_name self.preemptible = preemptible self.priority = priority + self.keep_tmp_file = keep_tmp_file def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]: """Launch multiple tasks. @@ -100,9 +102,12 @@ class VOLCRunner(BaseRunner): pwd = os.getcwd() # Dump task config to file mmengine.mkdir_or_exist('tmp/') - param_file = f'{pwd}/tmp/{os.getpid()}_params.py' + # Using uuid to avoid filename conflict + import uuid + uuid_str = str(uuid.uuid4()) + param_file = f'{pwd}/tmp/{uuid_str}_params.py' - volc_cfg_file = f'{pwd}/tmp/{os.getpid()}_cfg.yaml' + volc_cfg_file = f'{pwd}/tmp/{uuid_str}_cfg.yaml' volc_cfg = self._choose_flavor(num_gpus) with open(volc_cfg_file, 'w') as fp: yaml.dump(volc_cfg, fp, sort_keys=False) @@ -191,8 +196,12 @@ class VOLCRunner(BaseRunner): finally: # Clean up - os.remove(param_file) - os.remove(volc_cfg_file) + if not self.keep_tmp_file: + os.remove(param_file) + os.remove(volc_cfg_file) + else: + pass + return task_name, returncode def _run_task(self, cmd, log_path, poll_interval):