diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py
index a2d2c607..87c51a76 100644
--- a/opencompass/runners/dlc.py
+++ b/opencompass/runners/dlc.py
@@ -1,9 +1,9 @@
-import inspect
 import os
 import os.path as osp
 import random
 import subprocess
 import time
+from functools import partial
 from typing import Any, Dict, List, Tuple
 
 import mmengine
@@ -82,7 +82,6 @@ class DLCRunner(BaseRunner):
         task = task_type(task_cfg)
         num_gpus = task.num_gpus
         task_name = task.name
-        script_path = inspect.getsourcefile(task_type)
 
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
@@ -90,28 +89,26 @@ class DLCRunner(BaseRunner):
         task_cfg.dump(param_file)
 
         # Build up DLC command
-        task_cmd_template = task.get_command_template()
-        task_cmd = task_cmd_template.replace('{SCRIPT_PATH}',
-                                             script_path).replace(
-                                                 '{CFG_PATH}', param_file)
         pwd = os.getcwd()
         shell_cmd = (f'source {self.aliyun_cfg["bashrc_path"]}; '
                      f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
                      f'cd {pwd}; '
-                     f'{task_cmd}')
+                     '{task_cmd}')
 
-        cmd = ('dlc create job'
-               f" --command '{shell_cmd}'"
-               f' --name {task_name[:512]}'
-               ' --kind BatchJob'
-               f" -c {self.aliyun_cfg['dlc_config_path']}"
-               f" --workspace_id {self.aliyun_cfg['workspace_id']}"
-               ' --worker_count 1'
-               f' --worker_cpu {max(num_gpus * 6, 8)}'
-               f' --worker_gpu {num_gpus}'
-               f' --worker_memory {max(num_gpus * 32, 48)}'
-               f" --worker_image {self.aliyun_cfg['worker_image']}"
-               ' --interactive')
+        tmpl = ('dlc create job'
+                f" --command '{shell_cmd}'"
+                f' --name {task_name[:512]}'
+                ' --kind BatchJob'
+                f" -c {self.aliyun_cfg['dlc_config_path']}"
+                f" --workspace_id {self.aliyun_cfg['workspace_id']}"
+                ' --worker_count 1'
+                f' --worker_cpu {max(num_gpus * 6, 8)}'
+                f' --worker_gpu {num_gpus}'
+                f' --worker_memory {max(num_gpus * 32, 48)}'
+                f" --worker_image {self.aliyun_cfg['worker_image']}"
+                ' --interactive')
+        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+        cmd = get_cmd()
 
         logger = get_logger()
         logger.debug(f'Running command: {cmd}')
@@ -138,6 +135,8 @@ class DLCRunner(BaseRunner):
             retry -= 1
             if random_sleep:
                 time.sleep(random.randint(0, 10))
+            # Re-generate command to refresh ports.
+            cmd = get_cmd()
             result = subprocess.run(cmd,
                                     shell=True,
                                     text=True,
diff --git a/opencompass/runners/local.py b/opencompass/runners/local.py
index 803ceb10..eb26e355 100644
--- a/opencompass/runners/local.py
+++ b/opencompass/runners/local.py
@@ -1,9 +1,9 @@
-import inspect
 import os
 import os.path as osp
 import subprocess
 import time
 from concurrent.futures import ThreadPoolExecutor
+from functools import partial
 from threading import Lock
 from typing import Any, Dict, List, Tuple
 
@@ -108,7 +108,6 @@ class LocalRunner(BaseRunner):
         """
 
         task_name = task.name
-        script_path = inspect.getsourcefile(type(task))
 
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
@@ -116,12 +115,11 @@ class LocalRunner(BaseRunner):
         task.cfg.dump(param_file)
 
         # Build up slurm command
-        task_cmd_template = task.get_command_template()
-        task_cmd = task_cmd_template.replace('{SCRIPT_PATH}',
-                                             script_path).replace(
-                                                 '{CFG_PATH}', param_file)
-        cmd = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids) + ' '
-        cmd += task_cmd
+        tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
+        tmpl += ' {task_cmd}'
+        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+        cmd = get_cmd()
+
         logger = get_logger()
         logger.debug(f'Running command: {cmd}')
 
diff --git a/opencompass/runners/slurm.py b/opencompass/runners/slurm.py
index 448f5b2c..646adcb8 100644
--- a/opencompass/runners/slurm.py
+++ b/opencompass/runners/slurm.py
@@ -1,9 +1,9 @@
-import inspect
 import os
 import os.path as osp
 import random
 import subprocess
 import time
+from functools import partial
 from typing import Any, Dict, List, Tuple
 
 import mmengine
@@ -85,7 +85,6 @@ class SlurmRunner(BaseRunner):
         task = task_type(task_cfg)
         num_gpus = task.num_gpus
         task_name = task.name
-        script_path = inspect.getsourcefile(task_type)
 
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
@@ -93,18 +92,17 @@ class SlurmRunner(BaseRunner):
         task_cfg.dump(param_file)
 
         # Build up slurm command
-        task_cmd_template = task.get_command_template()
-        task_cmd = task_cmd_template.replace('{SCRIPT_PATH}',
-                                             script_path).replace(
-                                                 '{CFG_PATH}', param_file)
-        cmd = 'srun'
+        tmpl = 'srun'
         if self.partition:
-            cmd += f' -p {self.partition}'
+            tmpl += f' -p {self.partition}'
         if self.quotatype:
-            cmd += f' --quotatype={self.quotatype}'
+            tmpl += f' --quotatype={self.quotatype}'
         if num_gpus > 0:
-            cmd += f' --gres=gpu:{num_gpus}'
-        cmd += f" -N1 -J '{task_name[:512]}' {task_cmd}"
+            tmpl += f' --gres=gpu:{num_gpus}'
+        tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
+        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+        cmd = get_cmd()
+
         logger = get_logger()
         logger.debug(f'Running command: {cmd}')
 
@@ -130,6 +128,8 @@ class SlurmRunner(BaseRunner):
             retry -= 1
             if random_sleep:
                 time.sleep(random.randint(0, 10))
+            # Re-generate command to refresh ports.
+            cmd = get_cmd()
             result = subprocess.run(cmd,
                                     shell=True,
                                     text=True,
diff --git a/opencompass/tasks/base.py b/opencompass/tasks/base.py
index 214c93b1..d153821e 100644
--- a/opencompass/tasks/base.py
+++ b/opencompass/tasks/base.py
@@ -10,7 +10,7 @@ from opencompass.utils import get_infer_output_path, task_abbr_from_cfg
 class BaseTask:
     """Base class for all tasks. There are two ways to run the task:
     1. Directly by calling the `run` method.
-    2. Calling the `get_command_template` method to get the command template,
+    2. Calling the `get_command` method to get the command,
         and then run the command in the shell.
 
     Args:
@@ -35,15 +35,13 @@ class BaseTask:
         """Run the task."""
 
     @abstractmethod
-    def get_command_template(self) -> str:
-        """Get the command template for the task.
+    def get_command(self, cfg_path, template) -> str:
+        """Get the command for the task.
 
-        The command template should
-        contain the following placeholders:
-        1. ``{SCRIPT_PATH}``: This placeholder will be replaced by the path to
-            the script file of the task.
-        2. ``{CFG_PATH}`` This placeholder will be replaced by the
-            path to the config file of the task.
+        Args:
+            cfg_path (str): The path to the config file of the task.
+            template (str): The template which has '{task_cmd}' to format
+                the command.
         """
 
     @property
diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py
index 71e3fc06..e1d0ef5c 100644
--- a/opencompass/tasks/openicl_eval.py
+++ b/opencompass/tasks/openicl_eval.py
@@ -31,8 +31,10 @@ class OpenICLEvalTask(BaseTask):
         self.num_gpus = 0
         self.logger = get_logger()
 
-    def get_command_template(self):
-        return 'python3 {SCRIPT_PATH} {CFG_PATH}'
+    def get_command(self, cfg_path, template):
+        script_path = __file__
+        command = f'python3 {script_path} {cfg_path}'
+        return template.format(task_cmd=command)
 
     def run(self):
         for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
diff --git a/opencompass/tasks/openicl_infer.py b/opencompass/tasks/openicl_infer.py
index e0e6025d..c350c680 100644
--- a/opencompass/tasks/openicl_infer.py
+++ b/opencompass/tasks/openicl_infer.py
@@ -31,13 +31,24 @@ class OpenICLInferTask(BaseTask):
         self.num_gpus = run_cfg.get('num_gpus', 0)
         self.num_procs = run_cfg.get('num_procs', 1)
 
-    def get_command_template(self):
+    def get_command(self, cfg_path, template):
+        """Get the command for the task.
+
+        Args:
+            cfg_path (str): The path to the config file of the task.
+            template (str): The template which has '{task_cmd}' to format
+                the command.
+        """
+        script_path = __file__
         if self.num_gpus > 0:
-            return (f'torchrun --master_port={random.randint(12000, 32000)} '
-                    f'--nproc_per_node {self.num_procs} '
-                    '{SCRIPT_PATH} {CFG_PATH}')
+            port = random.randint(12000, 32000)
+            command = (f'torchrun --master_port={port} '
+                       f'--nproc_per_node {self.num_procs} '
+                       f'{script_path} {cfg_path}')
         else:
-            return ('python {SCRIPT_PATH} {CFG_PATH}')
+            command = f'python {script_path} {cfg_path}'
+
+        return template.format(task_cmd=command)
 
     def run(self):
         for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):