# OpenCompass/opencompass/runners/local.py
import os
import os.path as osp
import re
import subprocess
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from threading import Lock
from typing import Any, Dict, List, Tuple
import mmengine
import numpy as np
from mmengine.config import ConfigDict
from tqdm import tqdm
from opencompass.registry import RUNNERS, TASKS
from opencompass.utils import get_logger
from .base import BaseRunner
def get_command_template(gpu_ids: List[int]) -> str:
    """Build a shell command template pinning ``CUDA_VISIBLE_DEVICES``.

    The returned string contains a ``{task_cmd}`` placeholder to be filled
    in with the actual task command later.

    Args:
        gpu_ids (List[int]): GPU ids to expose to the task.

    Returns:
        str: A platform-specific command template.
    """
    devices = ','.join(str(gpu) for gpu in gpu_ids)
    if sys.platform == 'win32':
        # cmd.exe syntax: `set VAR=value & command`.
        # NOTE(review): the space before `&` becomes part of the variable
        # value in cmd.exe — kept for byte-compatibility with callers.
        return f'set CUDA_VISIBLE_DEVICES={devices} & {{task_cmd}}'
    # POSIX shells accept an inline env assignment before the command.
    return f'CUDA_VISIBLE_DEVICES={devices} {{task_cmd}}'
@RUNNERS.register_module()
class LocalRunner(BaseRunner):
    """Local runner. Start tasks by local python.

    Args:
        task (ConfigDict): Task type config.
        max_num_workers (int): Max number of workers to run in parallel.
            Defaults to 16.
        max_workers_per_gpu (int): Max number of workers to run for one GPU.
            Defaults to 1.
        debug (bool): Whether to run in debug mode.
        lark_bot_url (str): Lark bot url.
    """

    def __init__(
        self,
        task: ConfigDict,
        max_num_workers: int = 16,
        debug: bool = False,
        max_workers_per_gpu: int = 1,
        lark_bot_url: str = None,
    ):
        super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
        self.max_num_workers = max_num_workers
        self.max_workers_per_gpu = max_workers_per_gpu

    def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
        """Launch multiple tasks.

        Args:
            tasks (list[dict]): A list of task configs, usually generated by
                Partitioner.

        Returns:
            list[tuple[str, int]]: A list of (task name, exit code).
        """
        status = []
        # Lazy import: avoid paying the torch import cost until a launch
        # actually happens.
        import torch
        if 'CUDA_VISIBLE_DEVICES' in os.environ:
            # Parse the visible device ids; the negative lookbehind skips
            # the "-1" placeholder used to disable CUDA.
            all_gpu_ids = [
                int(i) for i in re.findall(r'(?<!-)\d+',
                                           os.getenv('CUDA_VISIBLE_DEVICES'))
            ]
        else:
            all_gpu_ids = list(range(torch.cuda.device_count()))

        if self.debug:
            # Debug mode: run tasks sequentially in this process.
            for task in tasks:
                task = TASKS.build(dict(cfg=task, type=self.task_cfg['type']))
                task_name = task.name
                num_gpus = task.num_gpus
                assert len(all_gpu_ids) >= num_gpus

                # get cmd
                mmengine.mkdir_or_exist('tmp/')
                param_file = f'tmp/{os.getpid()}_params.py'
                try:
                    task.cfg.dump(param_file)
                    # if use torchrun, restrict it behaves the same as non
                    # debug mode, otherwise, the torchrun will use all the
                    # available resources which might cause inconsistent
                    # behavior.
                    if len(all_gpu_ids) > num_gpus and num_gpus > 0:
                        get_logger().warning(f'Only use {num_gpus} GPUs for '
                                             f'total {len(all_gpu_ids)} '
                                             'available GPUs in debug mode.')
                    tmpl = get_command_template(all_gpu_ids[:num_gpus])
                    cmd = task.get_command(cfg_path=param_file, template=tmpl)
                    # run in subprocess if starts with torchrun etc.
                    if 'python3 ' in cmd or 'python ' in cmd:
                        # If it is an infer type task do not reload if
                        # the current model has already been loaded.
                        if 'infer' in self.task_cfg.type.lower():
                            # If a model instance already exists,
                            # do not reload it.
                            if hasattr(self, 'cur_model'):
                                task.run(self.cur_model)
                            else:
                                task.run()
                                self.cur_model = task.model
                        else:
                            task.run()
                    else:
                        subprocess.run(cmd, shell=True, text=True)
                finally:
                    # dump() may have failed before creating the file; do not
                    # mask the original exception with a FileNotFoundError.
                    if osp.exists(param_file):
                        os.remove(param_file)

                status.append((task_name, 0))
        else:
            if len(all_gpu_ids) > 0:
                # gpus[i] = number of free worker slots on physical GPU i.
                gpus = np.zeros(max(all_gpu_ids) + 1, dtype=np.uint)
                gpus[all_gpu_ids] = self.max_workers_per_gpu
            else:
                gpus = np.array([], dtype=np.uint)

            pbar = tqdm(total=len(tasks))
            lock = Lock()

            def submit(task, index):
                task = TASKS.build(dict(cfg=task, type=self.task_cfg['type']))
                num_gpus = task.num_gpus
                assert len(gpus) >= num_gpus
                # Poll (1 s interval) until enough GPUs have a free slot.
                while True:
                    lock.acquire()
                    if sum(gpus > 0) >= num_gpus:
                        gpu_ids = np.where(gpus)[0][:num_gpus]
                        gpus[gpu_ids] -= 1
                        lock.release()
                        break
                    lock.release()
                    time.sleep(1)

                if num_gpus > 0:
                    tqdm.write(f'launch {task.name} on GPU ' +
                               ','.join(map(str, gpu_ids)))
                else:
                    tqdm.write(f'launch {task.name} on CPU ')

                res = self._launch(task, gpu_ids, index)
                pbar.update()
                # Return the slots to the pool.
                with lock:
                    gpus[gpu_ids] += 1
                return res

            with ThreadPoolExecutor(
                    max_workers=self.max_num_workers) as executor:
                # Materialize so the declared list return type holds and all
                # tasks are finished before returning (executor.map is lazy).
                status = list(
                    executor.map(submit, tasks, range(len(tasks))))

        return status

    def _launch(self, task, gpu_ids, index):
        """Launch a single task in a subprocess.

        Args:
            task (BaseTask): Task to launch.
            gpu_ids (list[int]): GPU ids assigned to this task.
            index (int): Task index, used to disambiguate param files.

        Returns:
            tuple[str, int]: Task name and exit code.
        """
        task_name = task.name
        # Dump task config to file
        mmengine.mkdir_or_exist('tmp/')
        param_file = f'tmp/{os.getpid()}_{index}_params.py'
        try:
            task.cfg.dump(param_file)
            tmpl = get_command_template(gpu_ids)
            cmd = task.get_command(cfg_path=param_file, template=tmpl)
            logger = get_logger()
            logger.debug(f'Running command: {cmd}')

            # Run command, redirecting both stdout and stderr to the task
            # log. `with` closes the handle deterministically (the original
            # code leaked it).
            out_path = task.get_log_path(file_extension='out')
            mmengine.mkdir_or_exist(osp.split(out_path)[0])
            with open(out_path, 'w', encoding='utf-8') as stdout:
                result = subprocess.run(cmd,
                                        shell=True,
                                        text=True,
                                        stdout=stdout,
                                        stderr=stdout)

            if result.returncode != 0:
                logger.error(f'task {task_name} fail, see\n{out_path}')
        finally:
            # Clean up; the param file may not exist if dump() failed.
            if osp.exists(param_file):
                os.remove(param_file)

        return task_name, result.returncode