Auto re-generate port number during retry (#24)

* Auto re-generate port number during retry

* Fix slurm command
Ma Zerun 2023-07-07 17:25:56 +08:00 committed by GitHub
parent efdf116f18
commit 805293a9f2
6 changed files with 61 additions and 53 deletions
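The shape of the change, as a standalone sketch (DummyTask, my_script.py, the config path and the srun wrapper below are illustrative placeholders, not OpenCompass code): each runner now builds a launch template containing a literal '{task_cmd}' placeholder, binds it and the dumped config path into a partial of task.get_command, and calls that partial whenever it needs a command, so any randomized parts, such as the torchrun master port, are re-drawn on every call.

import random
from functools import partial


class DummyTask:
    """Illustrative stand-in for an OpenCompass task."""

    def get_command(self, cfg_path, template):
        # A fresh port is drawn on every call, so a retry never reuses the
        # port that just failed.
        port = random.randint(12000, 32000)
        command = f'torchrun --master_port={port} my_script.py {cfg_path}'
        return template.format(task_cmd=command)


tmpl = "srun -N1 -J 'demo' {task_cmd}"
get_cmd = partial(DummyTask().get_command, cfg_path='tmp/params.py', template=tmpl)
print(get_cmd())  # first attempt
print(get_cmd())  # retry: same template, different --master_port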

View File

@@ -1,9 +1,9 @@
-import inspect
 import os
 import os.path as osp
 import random
 import subprocess
 import time
+from functools import partial
 from typing import Any, Dict, List, Tuple
 import mmengine
@@ -82,7 +82,6 @@ class DLCRunner(BaseRunner):
 task = task_type(task_cfg)
 num_gpus = task.num_gpus
 task_name = task.name
-script_path = inspect.getsourcefile(task_type)
 # Dump task config to file
 mmengine.mkdir_or_exist('tmp/')
@@ -90,28 +89,26 @@ class DLCRunner(BaseRunner):
 task_cfg.dump(param_file)
 # Build up DLC command
-task_cmd_template = task.get_command_template()
-task_cmd = task_cmd_template.replace('{SCRIPT_PATH}',
-script_path).replace(
-'{CFG_PATH}', param_file)
 pwd = os.getcwd()
 shell_cmd = (f'source {self.aliyun_cfg["bashrc_path"]}; '
 f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
 f'cd {pwd}; '
-f'{task_cmd}')
+'{task_cmd}')
-cmd = ('dlc create job'
-f" --command '{shell_cmd}'"
-f' --name {task_name[:512]}'
-' --kind BatchJob'
-f" -c {self.aliyun_cfg['dlc_config_path']}"
-f" --workspace_id {self.aliyun_cfg['workspace_id']}"
-' --worker_count 1'
-f' --worker_cpu {max(num_gpus * 6, 8)}'
-f' --worker_gpu {num_gpus}'
-f' --worker_memory {max(num_gpus * 32, 48)}'
-f" --worker_image {self.aliyun_cfg['worker_image']}"
-' --interactive')
+tmpl = ('dlc create job'
+f" --command '{shell_cmd}'"
+f' --name {task_name[:512]}'
+' --kind BatchJob'
+f" -c {self.aliyun_cfg['dlc_config_path']}"
+f" --workspace_id {self.aliyun_cfg['workspace_id']}"
+' --worker_count 1'
+f' --worker_cpu {max(num_gpus * 6, 8)}'
+f' --worker_gpu {num_gpus}'
+f' --worker_memory {max(num_gpus * 32, 48)}'
+f" --worker_image {self.aliyun_cfg['worker_image']}"
+' --interactive')
+get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+cmd = get_cmd()
 logger = get_logger()
 logger.debug(f'Running command: {cmd}')
@@ -138,6 +135,8 @@ class DLCRunner(BaseRunner):
 retry -= 1
 if random_sleep:
 time.sleep(random.randint(0, 10))
+# Re-generate command to refresh ports.
+cmd = get_cmd()
 result = subprocess.run(cmd,
 shell=True,
 text=True,
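The retry loop above can be read as the following standalone sketch (run_with_retry, its defaults, and the bare return-code check are assumptions for illustration, not the runner's exact error handling):

import random
import subprocess
import time


def run_with_retry(get_cmd, max_retry=2, random_sleep=True):
    """Re-build the command before every retry so a refreshed port takes effect."""
    cmd = get_cmd()
    result = subprocess.run(cmd, shell=True, text=True)
    retry = max_retry
    while result.returncode != 0 and retry > 0:
        retry -= 1
        if random_sleep:
            time.sleep(random.randint(0, 10))
        # Re-generate command to refresh ports.
        cmd = get_cmd()
        result = subprocess.run(cmd, shell=True, text=True)
    return result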

View File

@@ -1,9 +1,9 @@
-import inspect
 import os
 import os.path as osp
 import subprocess
 import time
 from concurrent.futures import ThreadPoolExecutor
+from functools import partial
 from threading import Lock
 from typing import Any, Dict, List, Tuple
@@ -108,7 +108,6 @@ class LocalRunner(BaseRunner):
 """
 task_name = task.name
-script_path = inspect.getsourcefile(type(task))
 # Dump task config to file
 mmengine.mkdir_or_exist('tmp/')
@@ -116,12 +115,11 @@ class LocalRunner(BaseRunner):
 task.cfg.dump(param_file)
 # Build up slurm command
-task_cmd_template = task.get_command_template()
-task_cmd = task_cmd_template.replace('{SCRIPT_PATH}',
-script_path).replace(
-'{CFG_PATH}', param_file)
-cmd = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids) + ' '
-cmd += task_cmd
+tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
+tmpl += ' {task_cmd}'
+get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+cmd = get_cmd()
 logger = get_logger()
 logger.debug(f'Running command: {cmd}')
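For reference, the LocalRunner template expands like this (the GPU ids, port, and script/config paths are made-up values):

gpu_ids = [0, 1]
tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
tmpl += ' {task_cmd}'
# What a task might hand back once its placeholders are filled in.
cmd = tmpl.format(task_cmd='torchrun --master_port=23456 --nproc_per_node 2 '
                  'infer_script.py tmp/params.py')
print(cmd)
# CUDA_VISIBLE_DEVICES=0,1 torchrun --master_port=23456 --nproc_per_node 2 infer_script.py tmp/params.py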

View File

@@ -1,9 +1,9 @@
-import inspect
 import os
 import os.path as osp
 import random
 import subprocess
 import time
+from functools import partial
 from typing import Any, Dict, List, Tuple
 import mmengine
@@ -85,7 +85,6 @@ class SlurmRunner(BaseRunner):
 task = task_type(task_cfg)
 num_gpus = task.num_gpus
 task_name = task.name
-script_path = inspect.getsourcefile(task_type)
 # Dump task config to file
 mmengine.mkdir_or_exist('tmp/')
@@ -93,18 +92,17 @@ class SlurmRunner(BaseRunner):
 task_cfg.dump(param_file)
 # Build up slurm command
-task_cmd_template = task.get_command_template()
-task_cmd = task_cmd_template.replace('{SCRIPT_PATH}',
-script_path).replace(
-'{CFG_PATH}', param_file)
-cmd = 'srun'
+tmpl = 'srun'
 if self.partition:
-cmd += f' -p {self.partition}'
+tmpl += f' -p {self.partition}'
 if self.quotatype:
-cmd += f' --quotatype={self.quotatype}'
+tmpl += f' --quotatype={self.quotatype}'
 if num_gpus > 0:
-cmd += f' --gres=gpu:{num_gpus}'
-cmd += f" -N1 -J '{task_name[:512]}' {task_cmd}"
+tmpl += f' --gres=gpu:{num_gpus}'
+tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
+get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+cmd = get_cmd()
 logger = get_logger()
 logger.debug(f'Running command: {cmd}')
@@ -130,6 +128,8 @@ class SlurmRunner(BaseRunner):
 retry -= 1
 if random_sleep:
 time.sleep(random.randint(0, 10))
+# Re-generate command to refresh ports.
+cmd = get_cmd()
 result = subprocess.run(cmd,
 shell=True,
 text=True,

View File

@@ -10,7 +10,7 @@ from opencompass.utils import get_infer_output_path, task_abbr_from_cfg
 class BaseTask:
 """Base class for all tasks. There are two ways to run the task:
 1. Directly by calling the `run` method.
-2. Calling the `get_command_template` method to get the command template,
+2. Calling the `get_command` method to get the command,
 and then run the command in the shell.
 Args:
@@ -35,15 +35,13 @@ class BaseTask:
 """Run the task."""
 @abstractmethod
-def get_command_template(self) -> str:
+def get_command(self, cfg_path, template) -> str:
 """Get the command template for the task.
-The command template should
-contain the following placeholders:
-1. ``{SCRIPT_PATH}``: This placeholder will be replaced by the path to
-the script file of the task.
-2. ``{CFG_PATH}`` This placeholder will be replaced by the
-path to the config file of the task.
+Args:
+cfg_path (str): The path to the config file of the task.
+template (str): The template which has '{task_cmd}' to format
+the command.
 """
 @property
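A minimal task honoring this contract could look like the sketch below; EchoTask and its echo command are hypothetical and only show how cfg_path and template combine:

class EchoTask:
    """Hypothetical task implementing the get_command contract."""

    name = 'echo-demo'
    num_gpus = 0

    def get_command(self, cfg_path, template):
        # The task owns its launch command; the runner only supplies a
        # wrapper template with a '{task_cmd}' placeholder.
        command = f'echo running with config {cfg_path}'
        return template.format(task_cmd=command)


print(EchoTask().get_command('tmp/params.py', 'srun -N1 {task_cmd}'))
# srun -N1 echo running with config tmp/params.py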

View File

@@ -31,8 +31,10 @@ class OpenICLEvalTask(BaseTask):
 self.num_gpus = 0
 self.logger = get_logger()
-def get_command_template(self):
-return 'python3 {SCRIPT_PATH} {CFG_PATH}'
+def get_command(self, cfg_path, template):
+script_path = __file__
+command = f'python3 {script_path} {cfg_path}'
+return template.format(task_cmd=command)
 def run(self):
 for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):

View File

@@ -31,13 +31,24 @@ class OpenICLInferTask(BaseTask):
 self.num_gpus = run_cfg.get('num_gpus', 0)
 self.num_procs = run_cfg.get('num_procs', 1)
-def get_command_template(self):
+def get_command(self, cfg_path, template):
+"""Get the command template for the task.
+Args:
+cfg_path (str): The path to the config file of the task.
+template (str): The template which has '{task_cmd}' to format
+the command.
+"""
+script_path = __file__
 if self.num_gpus > 0:
-return (f'torchrun --master_port={random.randint(12000, 32000)} '
-f'--nproc_per_node {self.num_procs} '
-'{SCRIPT_PATH} {CFG_PATH}')
+port = random.randint(12000, 32000)
+command = (f'torchrun --master_port={port} '
+f'--nproc_per_node {self.num_procs} '
+f'{script_path} {cfg_path}')
 else:
-return ('python {SCRIPT_PATH} {CFG_PATH}')
+command = f'python {script_path} {cfg_path}'
+return template.format(task_cmd=command)
 def run(self):
 for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
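Taken together with the runner changes, two consecutive command generations should differ only in --master_port; a self-contained check under that assumption (get_infer_command, infer.py, and the srun wrapper are placeholder names):

import random
import re


def get_infer_command(cfg_path, template, num_procs=8):
    """Standalone mimic of the GPU branch above."""
    port = random.randint(12000, 32000)
    command = (f'torchrun --master_port={port} '
               f'--nproc_per_node {num_procs} '
               f'infer.py {cfg_path}')
    return template.format(task_cmd=command)


def mask_port(cmd):
    return re.sub(r'--master_port=\d+', '--master_port=*', cmd)


tmpl = "srun --gres=gpu:8 -N1 -J 'demo' {task_cmd}"
first = get_infer_command('tmp/params.py', tmpl)
second = get_infer_command('tmp/params.py', tmpl)  # what a retry would run
assert mask_port(first) == mask_port(second)  # only the port can differ
print(first)
print(second)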