Auto re-generate port number during retry (#24)

* Auto re-generate port number during retry
* Fix slurm command

parent efdf116f18
commit 805293a9f2
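What changed: previously each runner called `task.get_command_template()` once, substituted `{SCRIPT_PATH}` and `{CFG_PATH}` itself, and reused the finished command for every retry. For the inference task the `torchrun --master_port` was drawn at template-creation time, so a failed attempt retried on the very same port. After this commit the runner only builds a wrapper template with a literal `{task_cmd}` slot, binds it with `functools.partial(task.get_command, cfg_path=..., template=...)`, and calls `get_cmd()` again inside the retry loop, so the task can draw a fresh port on each attempt. A minimal sketch of that flow (the `FakeTask` class, template and paths are illustrative, not the real OpenCompass classes):

    import random
    from functools import partial


    class FakeTask:
        """Stand-in for an OpenCompass task: draws a new port on every call."""

        def get_command(self, cfg_path, template):
            port = random.randint(12000, 32000)
            task_cmd = f'torchrun --master_port={port} infer.py {cfg_path}'
            return template.format(task_cmd=task_cmd)


    tmpl = "srun -N1 -J 'demo' {task_cmd}"
    get_cmd = partial(FakeTask().get_command, cfg_path='tmp/0.py', template=tmpl)

    cmd = get_cmd()            # first attempt
    for _ in range(2):         # each retry re-generates the command -> fresh port
        cmd = get_cmd()
        print(cmd)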
@@ -1,9 +1,9 @@
-import inspect
 import os
 import os.path as osp
 import random
 import subprocess
 import time
+from functools import partial
 from typing import Any, Dict, List, Tuple
 
 import mmengine
@@ -82,7 +82,6 @@ class DLCRunner(BaseRunner):
         task = task_type(task_cfg)
         num_gpus = task.num_gpus
         task_name = task.name
-        script_path = inspect.getsourcefile(task_type)
 
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
@@ -90,17 +89,13 @@ class DLCRunner(BaseRunner):
         task_cfg.dump(param_file)
 
         # Build up DLC command
-        task_cmd_template = task.get_command_template()
-        task_cmd = task_cmd_template.replace('{SCRIPT_PATH}',
-                                             script_path).replace(
-                                                 '{CFG_PATH}', param_file)
         pwd = os.getcwd()
         shell_cmd = (f'source {self.aliyun_cfg["bashrc_path"]}; '
                      f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
                      f'cd {pwd}; '
-                     f'{task_cmd}')
+                     '{task_cmd}')
 
-        cmd = ('dlc create job'
+        tmpl = ('dlc create job'
                f" --command '{shell_cmd}'"
               f' --name {task_name[:512]}'
               ' --kind BatchJob'
@@ -112,6 +107,8 @@ class DLCRunner(BaseRunner):
               f' --worker_memory {max(num_gpus * 32, 48)}'
               f" --worker_image {self.aliyun_cfg['worker_image']}"
               ' --interactive')
+        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+        cmd = get_cmd()
 
         logger = get_logger()
         logger.debug(f'Running command: {cmd}')
@@ -138,6 +135,8 @@ class DLCRunner(BaseRunner):
                 retry -= 1
                 if random_sleep:
                     time.sleep(random.randint(0, 10))
+                # Re-generate command to refresh ports.
+                cmd = get_cmd()
                 result = subprocess.run(cmd,
                                         shell=True,
                                         text=True,
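In the DLC runner the wrapper template is the whole `dlc create job ... --command '<shell_cmd>'` string. Note that the last fragment of `shell_cmd` is now a plain string rather than an f-string, so the `{task_cmd}` placeholder survives into `tmpl` and is only filled in when `get_cmd()` runs. A rough sketch of that composition with made-up config values (not the real `aliyun_cfg`):

    from functools import partial

    # Placeholder stays literal because this fragment is not an f-string.
    shell_cmd = ('source ~/.bashrc; '
                 'conda activate opencompass; '
                 'cd /workspace; '
                 '{task_cmd}')

    tmpl = ('dlc create job'
            f" --command '{shell_cmd}'"
            ' --name demo-task'
            ' --kind BatchJob'
            ' --interactive')


    def fake_get_command(cfg_path, template):
        # Stand-in for task.get_command: the slot is filled at call time.
        return template.format(task_cmd=f'python3 eval.py {cfg_path}')


    get_cmd = partial(fake_get_command, cfg_path='tmp/0.py', template=tmpl)
    print(get_cmd())
    # dlc create job --command 'source ~/.bashrc; conda activate opencompass;
    # cd /workspace; python3 eval.py tmp/0.py' --name demo-task --kind BatchJob --interactive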
@@ -1,9 +1,9 @@
-import inspect
 import os
 import os.path as osp
 import subprocess
 import time
 from concurrent.futures import ThreadPoolExecutor
+from functools import partial
 from threading import Lock
 from typing import Any, Dict, List, Tuple
 
@@ -108,7 +108,6 @@ class LocalRunner(BaseRunner):
         """
 
         task_name = task.name
-        script_path = inspect.getsourcefile(type(task))
 
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
@@ -116,12 +115,11 @@ class LocalRunner(BaseRunner):
         task.cfg.dump(param_file)
 
         # Build up slurm command
-        task_cmd_template = task.get_command_template()
-        task_cmd = task_cmd_template.replace('{SCRIPT_PATH}',
-                                             script_path).replace(
-                                                 '{CFG_PATH}', param_file)
-        cmd = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids) + ' '
-        cmd += task_cmd
+        tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
+        tmpl += ' {task_cmd}'
+        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+        cmd = get_cmd()
 
         logger = get_logger()
         logger.debug(f'Running command: {cmd}')
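The local runner's wrapper is just a `CUDA_VISIBLE_DEVICES` prefix plus the `{task_cmd}` slot; the GPU ids stay fixed and only the task command is re-generated. A small sketch (the gpu ids and inner command are assumed values):

    gpu_ids = [0, 1]                # assumed for illustration
    tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
    tmpl += ' {task_cmd}'

    print(tmpl.format(task_cmd='python3 infer.py tmp/1.py'))
    # CUDA_VISIBLE_DEVICES=0,1 python3 infer.py tmp/1.py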
@@ -1,9 +1,9 @@
-import inspect
 import os
 import os.path as osp
 import random
 import subprocess
 import time
+from functools import partial
 from typing import Any, Dict, List, Tuple
 
 import mmengine
@@ -85,7 +85,6 @@ class SlurmRunner(BaseRunner):
         task = task_type(task_cfg)
         num_gpus = task.num_gpus
         task_name = task.name
-        script_path = inspect.getsourcefile(task_type)
 
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
@@ -93,18 +92,17 @@ class SlurmRunner(BaseRunner):
         task_cfg.dump(param_file)
 
         # Build up slurm command
-        task_cmd_template = task.get_command_template()
-        task_cmd = task_cmd_template.replace('{SCRIPT_PATH}',
-                                             script_path).replace(
-                                                 '{CFG_PATH}', param_file)
-        cmd = 'srun'
+        tmpl = 'srun'
         if self.partition:
-            cmd += f' -p {self.partition}'
+            tmpl += f' -p {self.partition}'
         if self.quotatype:
-            cmd += f' --quotatype={self.quotatype}'
+            tmpl += f' --quotatype={self.quotatype}'
         if num_gpus > 0:
-            cmd += f' --gres=gpu:{num_gpus}'
-        cmd += f" -N1 -J '{task_name[:512]}' {task_cmd}"
+            tmpl += f' --gres=gpu:{num_gpus}'
+        tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
+        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+        cmd = get_cmd()
 
         logger = get_logger()
         logger.debug(f'Running command: {cmd}')
 
@@ -130,6 +128,8 @@ class SlurmRunner(BaseRunner):
                 retry -= 1
                 if random_sleep:
                     time.sleep(random.randint(0, 10))
+                # Re-generate command to refresh ports.
+                cmd = get_cmd()
                 result = subprocess.run(cmd,
                                         shell=True,
                                         text=True,
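The "Fix slurm command" bullet is the `-N1 -J` line: the task name is still interpolated through an f-string, but the trailing `' {task_cmd}'` is concatenated as a plain string, so the placeholder is preserved instead of being evaluated at build time (inside the f-string it would be expanded immediately, or raise NameError if `task_cmd` were undefined). A short sketch with assumed values for the partition, task name and GPU count:

    task_name = 'demo[opt-125m/siqa]'   # assumed task name
    num_gpus = 8                        # assumed

    tmpl = 'srun'
    tmpl += ' -p gpu-part'              # assumed partition
    if num_gpus > 0:
        tmpl += f' --gres=gpu:{num_gpus}'
    # Literal ' {task_cmd}' stays outside the f-string so the slot survives.
    tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'

    print(tmpl.format(task_cmd='torchrun --master_port=23456 infer.py tmp/2.py'))
    # srun -p gpu-part --gres=gpu:8 -N1 -J 'demo[opt-125m/siqa]' torchrun --master_port=23456 infer.py tmp/2.py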
@@ -10,7 +10,7 @@ from opencompass.utils import get_infer_output_path, task_abbr_from_cfg
 class BaseTask:
     """Base class for all tasks. There are two ways to run the task:
     1. Directly by calling the `run` method.
-    2. Calling the `get_command_template` method to get the command template,
+    2. Calling the `get_command` method to get the command,
        and then run the command in the shell.
 
     Args:
@@ -35,15 +35,13 @@ class BaseTask:
         """Run the task."""
 
     @abstractmethod
-    def get_command_template(self) -> str:
+    def get_command(self, cfg_path, template) -> str:
         """Get the command template for the task.
 
-        The command template should
-        contain the following placeholders:
-        1. ``{SCRIPT_PATH}``: This placeholder will be replaced by the path to
-           the script file of the task.
-        2. ``{CFG_PATH}`` This placeholder will be replaced by the
-           path to the config file of the task.
+        Args:
+            cfg_path (str): The path to the config file of the task.
+            template (str): The template which have '{task_cmd}' to format
+                the command.
         """
 
     @property
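Under the new contract the runner owns the wrapper (srun, dlc or CUDA prefix) and the task owns the inner command: a subclass fills the runner-supplied `{task_cmd}` slot and returns the finished string. A toy subclass, purely illustrative and not one of the real tasks:

    from abc import ABC, abstractmethod


    class TaskSketch(ABC):
        """Trimmed-down stand-in for BaseTask, showing only the command contract."""

        @abstractmethod
        def get_command(self, cfg_path, template) -> str:
            """Return the full shell command with the '{task_cmd}' slot filled."""


    class EchoTask(TaskSketch):

        def get_command(self, cfg_path, template) -> str:
            return template.format(task_cmd=f'echo running {cfg_path}')


    print(EchoTask().get_command('tmp/3.py', 'srun -N1 {task_cmd}'))
    # srun -N1 echo running tmp/3.py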
@@ -31,8 +31,10 @@ class OpenICLEvalTask(BaseTask):
         self.num_gpus = 0
         self.logger = get_logger()
 
-    def get_command_template(self):
-        return 'python3 {SCRIPT_PATH} {CFG_PATH}'
+    def get_command(self, cfg_path, template):
+        script_path = __file__
+        command = f'python3 {script_path} {cfg_path}'
+        return template.format(task_cmd=command)
 
     def run(self):
         for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
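Because the eval task now builds its command from `__file__` and the config path, the runners drop their `inspect.getsourcefile` lookup and the `{SCRIPT_PATH}`/`{CFG_PATH}` placeholders; the whole substitution reduces to one `str.format` call. Roughly (paths and template are illustrative):

    script_path = '/path/to/openicl_eval.py'   # the real task uses __file__
    cfg_path = 'tmp/4.py'
    template = 'CUDA_VISIBLE_DEVICES=0 {task_cmd}'

    command = f'python3 {script_path} {cfg_path}'
    print(template.format(task_cmd=command))
    # CUDA_VISIBLE_DEVICES=0 python3 /path/to/openicl_eval.py tmp/4.py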
@@ -31,13 +31,24 @@ class OpenICLInferTask(BaseTask):
         self.num_gpus = run_cfg.get('num_gpus', 0)
         self.num_procs = run_cfg.get('num_procs', 1)
 
-    def get_command_template(self):
+    def get_command(self, cfg_path, template):
+        """Get the command template for the task.
+
+        Args:
+            cfg_path (str): The path to the config file of the task.
+            template (str): The template which have '{task_cmd}' to format
+                the command.
+        """
+        script_path = __file__
         if self.num_gpus > 0:
-            return (f'torchrun --master_port={random.randint(12000, 32000)} '
+            port = random.randint(12000, 32000)
+            command = (f'torchrun --master_port={port} '
                     f'--nproc_per_node {self.num_procs} '
-                    '{SCRIPT_PATH} {CFG_PATH}')
+                    f'{script_path} {cfg_path}')
         else:
-            return ('python {SCRIPT_PATH} {CFG_PATH}')
+            command = 'python {script_path} {cfg_path}'
+
+        return template.format(task_cmd=command)
 
     def run(self):
         for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
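The port draw now happens inside `get_command`, so every call, including the retries added in the runner hunks above, gets a new `--master_port`. One caveat worth flagging: as committed, the CPU branch assigns `command = 'python {script_path} {cfg_path}'` without an `f` prefix, so those two placeholders would be emitted literally rather than interpolated; the GPU branch (and the sketch below) uses f-strings throughout. A minimal stand-alone imitation of the new behaviour (the class and paths are illustrative):

    import random


    class InferCommandSketch:
        """Mimics how the new OpenICLInferTask.get_command draws a port per call."""

        num_procs = 8

        def get_command(self, cfg_path, template):
            script_path = 'openicl_infer.py'        # the real task uses __file__
            port = random.randint(12000, 32000)     # fresh port on every call
            command = (f'torchrun --master_port={port} '
                       f'--nproc_per_node {self.num_procs} '
                       f'{script_path} {cfg_path}')
            return template.format(task_cmd=command)


    task = InferCommandSketch()
    print(task.get_command('tmp/5.py', '{task_cmd}'))   # attempt 1: some port
    print(task.get_command('tmp/5.py', '{task_cmd}'))   # retry: very likely a different port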