From 86503894f9b281a25a319a12d0e4e4f90b362d5f Mon Sep 17 00:00:00 2001 From: BIGWangYuDong Date: Mon, 28 Apr 2025 18:10:32 +0800 Subject: [PATCH 1/2] api: add `tmp_dir` in runners to avoid hardcode --- opencompass/runners/base.py | 5 ++++- opencompass/runners/dlc.py | 14 +++++++++++--- opencompass/runners/local.py | 25 +++++++++++++++++-------- opencompass/runners/local_api.py | 15 +++++++++++---- opencompass/runners/slurm.py | 15 +++++++++++---- opencompass/runners/slurm_sequential.py | 17 +++++++++++++---- opencompass/runners/volc.py | 19 ++++++++++++++----- 7 files changed, 81 insertions(+), 29 deletions(-) diff --git a/opencompass/runners/base.py b/opencompass/runners/base.py index 3cd2820f..67e545db 100644 --- a/opencompass/runners/base.py +++ b/opencompass/runners/base.py @@ -15,18 +15,21 @@ class BaseRunner: task (ConfigDict): Task type config. debug (bool): Whether to run in debug mode. lark_bot_url (str): Lark bot url. + tmp_dir (str): The directory to store temporary files. """ def __init__(self, task: ConfigDict, debug: bool = False, - lark_bot_url: str = None): + lark_bot_url: str = None, + tmp_dir: str = 'tmp'): self.task_cfg = Config(task) self.debug = debug if lark_bot_url: self.lark_reporter = LarkReporter(lark_bot_url) else: self.lark_reporter = None + self.tmp_dir = tmp_dir def __call__(self, tasks: List[Dict[str, Any]]): """Launch multiple tasks and summarize the results. diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 44e9fd00..9ed254d9 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -33,6 +33,9 @@ class DLCRunner(BaseRunner): retry (int): Number of retries when job failed. Default: 2. debug (bool): Whether to run in debug mode. Default: False. lark_bot_url (str): Lark bot url. Default: None. + keep_tmp_file (bool): Whether to keep the temporary file. + Default: True. + tmp_dir (str): The directory to store temporary files. Default: 'tmp'. 
""" def __init__( @@ -45,8 +48,12 @@ class DLCRunner(BaseRunner): debug: bool = False, lark_bot_url: str = None, keep_tmp_file: bool = True, + tmp_dir: str = 'tmp', ): - super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) + super().__init__(task=task, + debug=debug, + lark_bot_url=lark_bot_url, + tmp_dir=tmp_dir) self.aliyun_cfg = aliyun_cfg self.max_num_workers = max_num_workers self.retry = retry @@ -114,12 +121,13 @@ class DLCRunner(BaseRunner): break # Dump task config to file - mmengine.mkdir_or_exist('tmp/') + mmengine.mkdir_or_exist(self.tmp_dir) # Using uuid to avoid filename conflict import uuid uuid_str = str(uuid.uuid4()) - param_file = f'tmp/{uuid_str}_params.py' + param_file = f'{uuid_str}_params.py' + param_file = osp.join(self.tmp_dir, param_file) pwd = os.getcwd() try: cfg.dump(param_file) diff --git a/opencompass/runners/local.py b/opencompass/runners/local.py index 1b14ad17..21a1a91d 100644 --- a/opencompass/runners/local.py +++ b/opencompass/runners/local.py @@ -48,6 +48,10 @@ class LocalRunner(BaseRunner): Defaults to 1. debug (bool): Whether to run in debug mode. lark_bot_url (str): Lark bot url. + keep_tmp_file (bool): Whether to keep the temporary file. Defaults to + False. + tmp_dir (str): The directory to store temporary files. + Defaults to 'tmp'. 
""" def __init__(self, @@ -57,8 +61,12 @@ class LocalRunner(BaseRunner): max_workers_per_gpu: int = 1, lark_bot_url: str = None, keep_tmp_file: bool = False, + tmp_dir: str = 'tmp', **kwargs): - super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) + super().__init__(task=task, + debug=debug, + lark_bot_url=lark_bot_url, + tmp_dir=tmp_dir) self.max_num_workers = max_num_workers self.max_workers_per_gpu = max_workers_per_gpu self.keep_tmp_file = keep_tmp_file @@ -101,11 +109,12 @@ class LocalRunner(BaseRunner): num_gpus = task.num_gpus assert len(all_gpu_ids) >= num_gpus # get cmd - mmengine.mkdir_or_exist('tmp/') + mmengine.mkdir_or_exist(self.tmp_dir) import uuid uuid_str = str(uuid.uuid4()) - param_file = f'tmp/{uuid_str}_params.py' + param_file = f'{uuid_str}_params.py' + param_file = osp.join(self.tmp_dir, param_file) try: task.cfg.dump(param_file) # if use torchrun, restrict it behaves the same as non @@ -135,7 +144,8 @@ class LocalRunner(BaseRunner): else: task.run() else: - tmp_logs = f'tmp/{os.getpid()}_debug.log' + tmp_logs = f'{os.getpid()}_debug.log' + tmp_logs = osp.join(self.tmp_dir, tmp_logs) get_logger().warning( f'Debug mode, log will be saved to {tmp_logs}') with open(tmp_logs, 'a') as log_file: @@ -207,14 +217,13 @@ class LocalRunner(BaseRunner): task_name = task.name - pwd = os.getcwd() # Dump task config to file - mmengine.mkdir_or_exist('tmp/') + mmengine.mkdir_or_exist(self.tmp_dir) # Using uuid to avoid filename conflict import uuid uuid_str = str(uuid.uuid4()) - param_file = f'{pwd}/tmp/{uuid_str}_params.py' - + param_file = f'{uuid_str}_params.py' + param_file = osp.join(self.tmp_dir, param_file) try: task.cfg.dump(param_file) tmpl = get_command_template(gpu_ids) diff --git a/opencompass/runners/local_api.py b/opencompass/runners/local_api.py index 24874423..5f1f19da 100644 --- a/opencompass/runners/local_api.py +++ b/opencompass/runners/local_api.py @@ -161,6 +161,8 @@ class LocalAPIRunner(BaseRunner): Defaults to 16. 
debug (bool): Whether to run in debug mode. lark_bot_url (str): Lark bot url. + tmp_dir (str): The directory to store temporary files. + Defaults to 'tmp'. """ def __init__(self, @@ -168,8 +170,12 @@ class LocalAPIRunner(BaseRunner): concurrent_users: int, max_num_workers: int = 16, debug: bool = False, - lark_bot_url: str = None): - super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) + lark_bot_url: str = None, + tmp_dir: str = 'tmp'): + super().__init__(task=task, + debug=debug, + lark_bot_url=lark_bot_url, + tmp_dir=tmp_dir) self.max_num_workers = max_num_workers self.concurrent_users = concurrent_users assert task['type'] in [ @@ -194,8 +200,9 @@ class LocalAPIRunner(BaseRunner): task = TASKS.build(dict(cfg=task, type=self.task_cfg['type'])) task_name = task.name # get cmd - mmengine.mkdir_or_exist('tmp/') - param_file = f'tmp/{os.getpid()}_params.py' + mmengine.mkdir_or_exist(self.tmp_dir) + param_file = f'{os.getpid()}_params.py' + param_file = osp.join(self.tmp_dir, param_file) try: task.cfg.dump(param_file) cmd = task.get_command(cfg_path=param_file, diff --git a/opencompass/runners/slurm.py b/opencompass/runners/slurm.py index e882a4c9..31c1b3f6 100644 --- a/opencompass/runners/slurm.py +++ b/opencompass/runners/slurm.py @@ -33,6 +33,8 @@ class SlurmRunner(BaseRunner): lark_bot_url (str): Lark bot url. Defaults to None. extra_command (List, optional): Extra slurm command. For example ['-c 12', '-w node1']. Defaults to None. + tmp_dir (str): The directory to store temporary files. + Defaults to 'tmp'. 
""" def __init__(self, @@ -44,8 +46,12 @@ class SlurmRunner(BaseRunner): qos: str = None, debug: bool = False, lark_bot_url: str = None, - extra_command: Optional[List[str]] = None): - super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) + extra_command: Optional[List[str]] = None, + tmp_dir: str = 'tmp'): + super().__init__(task=task, + debug=debug, + lark_bot_url=lark_bot_url, + tmp_dir=tmp_dir) self.max_num_workers = max_num_workers self.retry = retry self.partition = partition @@ -93,8 +99,9 @@ class SlurmRunner(BaseRunner): task_name = task.name # Dump task config to file - mmengine.mkdir_or_exist('tmp/') - param_file = f'tmp/{os.getpid()}_params.py' + mmengine.mkdir_or_exist(self.tmp_dir) + param_file = f'{os.getpid()}_params.py' + param_file = osp.join(self.tmp_dir, param_file) try: cfg.dump(param_file) diff --git a/opencompass/runners/slurm_sequential.py b/opencompass/runners/slurm_sequential.py index 5dee149c..5afdf0c1 100644 --- a/opencompass/runners/slurm_sequential.py +++ b/opencompass/runners/slurm_sequential.py @@ -47,6 +47,10 @@ class SlurmSequentialRunner(BaseRunner): lark_bot_url (str): Lark bot url. Defaults to None. extra_command (List, optional): Extra slurm command. For example ['-c 12', '-w node1']. Defaults to None. + keep_tmp_file (bool): Whether to keep the temporary file. Defaults to + False. + tmp_dir (str): The directory to store temporary files. + Defaults to 'tmp'. 
""" def __init__(self, @@ -60,8 +64,12 @@ class SlurmSequentialRunner(BaseRunner): debug: bool = False, lark_bot_url: str = None, extra_command: Optional[List[str]] = None, - keep_tmp_file: bool = False): - super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) + keep_tmp_file: bool = False, + tmp_dir: str = 'tmp'): + super().__init__(task=task, + debug=debug, + lark_bot_url=lark_bot_url, + tmp_dir=tmp_dir) self.max_num_workers = max_num_workers self.retry = retry self.partition = partition @@ -172,11 +180,12 @@ class SlurmSequentialRunner(BaseRunner): task_name = self.task_prefix + task_name # Dump task config to file - mmengine.mkdir_or_exist('tmp/') + mmengine.mkdir_or_exist(self.tmp_dir) # Using uuid to avoid filename conflict import uuid uuid_str = str(uuid.uuid4()) - param_file = f'tmp/{uuid_str}_params.py' + param_file = f'{uuid_str}_params.py' + param_file = osp.join(self.tmp_dir, param_file) process = None try: cfg.dump(param_file) diff --git a/opencompass/runners/volc.py b/opencompass/runners/volc.py index 37cd441b..a1f20bd4 100644 --- a/opencompass/runners/volc.py +++ b/opencompass/runners/volc.py @@ -36,6 +36,9 @@ class VOLCRunner(BaseRunner): retry (int): Number of retries when job failed. Default: 2. debug (bool): Whether to run in debug mode. Default: False. lark_bot_url (str): Lark bot url. Default: None. + keep_tmp_file (bool): Whether to keep the temporary file. Default: + True. + tmp_dir (str): The directory to store temporary files. Default: 'tmp'. 
""" def __init__(self, @@ -48,8 +51,12 @@ class VOLCRunner(BaseRunner): retry: int = 2, debug: bool = False, lark_bot_url: str = None, - keep_tmp_file: bool = True): - super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) + keep_tmp_file: bool = True, + tmp_dir: str = 'tmp'): + super().__init__(task=task, + debug=debug, + lark_bot_url=lark_bot_url, + tmp_dir=tmp_dir) self.volcano_cfg = volcano_cfg self.max_num_workers = max_num_workers self.retry = retry @@ -101,13 +108,15 @@ class VOLCRunner(BaseRunner): # Build up VCC command pwd = os.getcwd() # Dump task config to file - mmengine.mkdir_or_exist('tmp/') + mmengine.mkdir_or_exist(self.tmp_dir) # Using uuid to avoid filename conflict import uuid uuid_str = str(uuid.uuid4()) - param_file = f'{pwd}/tmp/{uuid_str}_params.py' + param_file = f'{uuid_str}_params.py' + param_file = osp.join(self.tmp_dir, param_file) - volc_cfg_file = f'{pwd}/tmp/{uuid_str}_cfg.yaml' + volc_cfg_file = f'{uuid_str}_cfg.yaml' + volc_cfg_file = osp.join(self.tmp_dir, volc_cfg_file) volc_cfg = self._choose_flavor(num_gpus) with open(volc_cfg_file, 'w') as fp: yaml.dump(volc_cfg, fp, sort_keys=False) From 642cd2839be2e1ac0675540430ac56351764c5bc Mon Sep 17 00:00:00 2001 From: BIGWangYuDong Date: Mon, 28 Apr 2025 18:20:45 +0800 Subject: [PATCH 2/2] fix cache hardcode in partitioners --- opencompass/partitioners/num_worker.py | 3 ++- opencompass/partitioners/size.py | 3 ++- opencompass/partitioners/sub_num_worker.py | 3 ++- opencompass/partitioners/sub_size.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/opencompass/partitioners/num_worker.py b/opencompass/partitioners/num_worker.py index e916a17d..0712dc3d 100644 --- a/opencompass/partitioners/num_worker.py +++ b/opencompass/partitioners/num_worker.py @@ -141,7 +141,8 @@ class NumWorkerPartitioner(BasePartitioner): dataset = build_dataset_from_cfg(dataset) self.dataset_size[dataset_abbr] = len(dataset.test) - mmengine.mkdir_or_exist('.cache/') + 
tmp_dir_root = osp.dirname(self.dataset_size_path) + mmengine.mkdir_or_exist(tmp_dir_root) mmengine.dump(self.dataset_size, self.dataset_size_path, indent=4, diff --git a/opencompass/partitioners/size.py b/opencompass/partitioners/size.py index 10e64a2f..30eead75 100644 --- a/opencompass/partitioners/size.py +++ b/opencompass/partitioners/size.py @@ -214,7 +214,8 @@ class SizePartitioner(BasePartitioner): dataset = build_dataset_from_cfg(dataset) self.dataset_size[dataset_abbr] = len(dataset.test) - mmengine.mkdir_or_exist('.cache/') + tmp_dir_root = osp.dirname(self.dataset_size_path) + mmengine.mkdir_or_exist(tmp_dir_root) mmengine.dump(self.dataset_size, self.dataset_size_path, indent=4, diff --git a/opencompass/partitioners/sub_num_worker.py b/opencompass/partitioners/sub_num_worker.py index 13260858..70d10277 100644 --- a/opencompass/partitioners/sub_num_worker.py +++ b/opencompass/partitioners/sub_num_worker.py @@ -198,7 +198,8 @@ class SubjectiveNumWorkerPartitioner(SubjectiveNaivePartitioner): dataset = build_dataset_from_cfg(dataset) self.dataset_size[dataset_abbr] = len(dataset.test) - mmengine.mkdir_or_exist('.cache/') + tmp_dir_root = osp.dirname(self.dataset_size_path) + mmengine.mkdir_or_exist(tmp_dir_root) mmengine.dump(self.dataset_size, self.dataset_size_path, indent=4, diff --git a/opencompass/partitioners/sub_size.py b/opencompass/partitioners/sub_size.py index 1c68b6f9..be3cb9e7 100644 --- a/opencompass/partitioners/sub_size.py +++ b/opencompass/partitioners/sub_size.py @@ -274,7 +274,8 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner): dataset = build_dataset_from_cfg(dataset) self.dataset_size[dataset_abbr] = len(dataset.test) - mmengine.mkdir_or_exist('.cache/') + tmp_dir_root = osp.dirname(self.dataset_size_path) + mmengine.mkdir_or_exist(tmp_dir_root) mmengine.dump(self.dataset_size, self.dataset_size_path, indent=4,