[Fix] Fix local debug mode not restrict the resources (#522)

* [Fix] fix local debug mode not restrict the resources * minor fix
2025-05-30 16:03:24 +08:00 · 2023-10-30 18:13:43 +08:00 · 2023-10-30 18:13:43 +08:00 · b9270c3a60
commit b9270c3a60
parent 229a65f305
1 changed files with 35 additions and 23 deletions
--- a/opencompass/runners/local.py
+++ b/opencompass/runners/local.py
@ -2,6 +2,7 @@ import os
 import os.path as osp
 import re
 import subprocess
 import sys
 import time
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
@ -19,6 +20,18 @@ from opencompass.utils import get_logger
 from .base import BaseRunner
 def get_command_template(gpu_ids: List[int]) -> str:
    """Format command template given available gpu ids."""
    if sys.platform == 'win32':  # Always return win32 for Windows
        # use command in Windows format
        tmpl = 'set CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
        tmpl += ' & {task_cmd}'
    else:
        tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
        tmpl += ' {task_cmd}'
    return tmpl
@RUNNERS.register_module()
 class LocalRunner(BaseRunner):
    """Local runner. Start tasks by local python.
@ -55,17 +68,36 @@ class LocalRunner(BaseRunner):
        """
        status = []
        import torch
        if 'CUDA_VISIBLE_DEVICES' in os.environ:
            all_gpu_ids = [
                int(i) for i in re.findall(r'(?<!-)\d+',
                                           os.getenv('CUDA_VISIBLE_DEVICES'))
            ]
        else:
            all_gpu_ids = list(range(torch.cuda.device_count()))
        if self.debug:
            for task in tasks:
                task = TASKS.build(dict(cfg=task, type=self.task_cfg['type']))
                task_name = task.name
                num_gpus = task.num_gpus
                assert len(all_gpu_ids) >= num_gpus
                # get cmd
                mmengine.mkdir_or_exist('tmp/')
                param_file = f'tmp/{os.getpid()}_params.py'
                try:
                    task.cfg.dump(param_file)
-                    cmd = task.get_command(cfg_path=param_file,
+                    # if use torchrun, restrict it behaves the same as non
-                                           template='{task_cmd}')
+                    # debug mode, otherwise, the torchrun will use all the
                    # available resources which might cause inconsistent
                    # behavior.
                    if len(all_gpu_ids) > num_gpus and num_gpus > 0:
                        get_logger().warning(f'Only use {num_gpus} GPUs for '
                                             f'total {len(all_gpu_ids)} '
                                             'available GPUs in debug mode.')
                    tmpl = get_command_template(all_gpu_ids[:num_gpus])
                    cmd = task.get_command(cfg_path=param_file, template=tmpl)
                    # run in subprocess if starts with torchrun etc.
                    if cmd.startswith('python'):
                        task.run()
@ -75,15 +107,6 @@ class LocalRunner(BaseRunner):
                    os.remove(param_file)
                status.append((task_name, 0))
        else:
            import torch
            if 'CUDA_VISIBLE_DEVICES' in os.environ:
                all_gpu_ids = [
                    int(i) for i in re.findall(
                        r'(?<!-)\d+', os.getenv('CUDA_VISIBLE_DEVICES'))
                ]
            else:
                all_gpu_ids = list(range(torch.cuda.device_count()))
            if len(all_gpu_ids) > 0:
                gpus = np.zeros(max(all_gpu_ids) + 1, dtype=np.uint)
                gpus[all_gpu_ids] = self.max_workers_per_gpu
@ -145,18 +168,7 @@ class LocalRunner(BaseRunner):
        param_file = f'tmp/{os.getpid()}_{index}_params.py'
        try:
            task.cfg.dump(param_file)
-
+            tmpl = get_command_template(gpu_ids)
            # Build up local command
            import sys
            if sys.platform == 'win32':  # Always return win32 for Windows
                # use command in Windows format
                tmpl = 'set CUDA_VISIBLE_DEVICES=' + ','.join(
                    str(i) for i in gpu_ids)
                tmpl += ' & {task_cmd}'
            else:
                tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(
                    str(i) for i in gpu_ids)
                tmpl += ' {task_cmd}'
            get_cmd = partial(task.get_command,
                              cfg_path=param_file,
                              template=tmpl)