Merge 642cd2839b into 8c0ccf9a6b

2025-05-30 16:03:24 +08:00 · 2025-05-17 13:43:54 +08:00 · 2025-05-17 13:43:54 +08:00 · 32c1e38207
commit 32c1e38207
parent 8c0ccf9a6b 642cd2839b
11 changed files with 89 additions and 33 deletions
--- a/opencompass/partitioners/num_worker.py
+++ b/opencompass/partitioners/num_worker.py
@ -141,7 +141,8 @@ class NumWorkerPartitioner(BasePartitioner):
        dataset = build_dataset_from_cfg(dataset)
        self.dataset_size[dataset_abbr] = len(dataset.test)

-        mmengine.mkdir_or_exist('.cache/')
+        tmp_dir_root = osp.dirname(self.dataset_size_path)
+        mmengine.mkdir_or_exist(tmp_dir_root)
        mmengine.dump(self.dataset_size,
                      self.dataset_size_path,
                      indent=4,
--- a/opencompass/partitioners/size.py
+++ b/opencompass/partitioners/size.py
@ -214,7 +214,8 @@ class SizePartitioner(BasePartitioner):
        dataset = build_dataset_from_cfg(dataset)
        self.dataset_size[dataset_abbr] = len(dataset.test)

-        mmengine.mkdir_or_exist('.cache/')
+        tmp_dir_root = osp.dirname(self.dataset_size_path)
+        mmengine.mkdir_or_exist(tmp_dir_root)
        mmengine.dump(self.dataset_size,
                      self.dataset_size_path,
                      indent=4,
--- a/opencompass/partitioners/sub_num_worker.py
+++ b/opencompass/partitioners/sub_num_worker.py
@ -198,7 +198,8 @@ class SubjectiveNumWorkerPartitioner(SubjectiveNaivePartitioner):
        dataset = build_dataset_from_cfg(dataset)
        self.dataset_size[dataset_abbr] = len(dataset.test)

-        mmengine.mkdir_or_exist('.cache/')
+        tmp_dir_root = osp.dirname(self.dataset_size_path)
+        mmengine.mkdir_or_exist(tmp_dir_root)
        mmengine.dump(self.dataset_size,
                      self.dataset_size_path,
                      indent=4,
--- a/opencompass/partitioners/sub_size.py
+++ b/opencompass/partitioners/sub_size.py
@ -274,7 +274,8 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner):
        dataset = build_dataset_from_cfg(dataset)
        self.dataset_size[dataset_abbr] = len(dataset.test)

-        mmengine.mkdir_or_exist('.cache/')
+        tmp_dir_root = osp.dirname(self.dataset_size_path)
+        mmengine.mkdir_or_exist(tmp_dir_root)
        mmengine.dump(self.dataset_size,
                      self.dataset_size_path,
                      indent=4,
--- a/opencompass/runners/base.py
+++ b/opencompass/runners/base.py
@ -15,18 +15,21 @@ class BaseRunner:
        task (ConfigDict): Task type config.
        debug (bool): Whether to run in debug mode.
        lark_bot_url (str): Lark bot url.
+        tmp_dir (str): The directory to store temporary files.
    """

    def __init__(self,
                 task: ConfigDict,
                 debug: bool = False,
-                 lark_bot_url: str = None):
+                 lark_bot_url: str = None,
+                 tmp_dir: str = 'tmp'):
        self.task_cfg = Config(task)
        self.debug = debug
        if lark_bot_url:
            self.lark_reporter = LarkReporter(lark_bot_url)
        else:
            self.lark_reporter = None
+        self.tmp_dir = tmp_dir

    def __call__(self, tasks: List[Dict[str, Any]]):
        """Launch multiple tasks and summarize the results.
--- a/opencompass/runners/dlc.py
+++ b/opencompass/runners/dlc.py
@ -33,6 +33,9 @@ class DLCRunner(BaseRunner):
        retry (int): Number of retries when job failed. Default: 2.
        debug (bool): Whether to run in debug mode. Default: False.
        lark_bot_url (str): Lark bot url. Default: None.
+        keep_tmp_file (bool): Whether to keep the temporary file.
+            Default: True.
+        tmp_dir (str): The directory to store temporary files. Default: 'tmp'.
    """

    def __init__(
@ -45,8 +48,12 @@ class DLCRunner(BaseRunner):
        debug: bool = False,
        lark_bot_url: str = None,
        keep_tmp_file: bool = True,
+        tmp_dir: str = 'tmp',
    ):
-        super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
+        super().__init__(task=task,
+                         debug=debug,
+                         lark_bot_url=lark_bot_url,
+                         tmp_dir=tmp_dir)
        self.aliyun_cfg = aliyun_cfg
        self.max_num_workers = max_num_workers
        self.retry = retry
@ -114,12 +121,13 @@ class DLCRunner(BaseRunner):
                    break

        # Dump task config to file
-        mmengine.mkdir_or_exist('tmp/')
+        mmengine.mkdir_or_exist(self.tmp_dir)
        # Using uuid to avoid filename conflict
        import uuid

        uuid_str = str(uuid.uuid4())
-        param_file = f'tmp/{uuid_str}_params.py'
+        param_file = f'{uuid_str}_params.py'
+        param_file = osp.join(self.tmp_dir, param_file)
        pwd = os.getcwd()
        try:
            cfg.dump(param_file)
--- a/opencompass/runners/local.py
+++ b/opencompass/runners/local.py
@ -48,6 +48,10 @@ class LocalRunner(BaseRunner):
            Defaults to 1.
        debug (bool): Whether to run in debug mode.
        lark_bot_url (str): Lark bot url.
+        keep_tmp_file (bool): Whether to keep the temporary file. Defaults to
+            False.
+        tmp_dir (str): The directory to store temporary files.
+            Defaults to 'tmp'.
    """

    def __init__(self,
@ -57,8 +61,12 @@ class LocalRunner(BaseRunner):
                 max_workers_per_gpu: int = 1,
                 lark_bot_url: str = None,
                 keep_tmp_file: bool = False,
+                 tmp_dir: str = 'tmp',
                 **kwargs):
-        super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
+        super().__init__(task=task,
+                         debug=debug,
+                         lark_bot_url=lark_bot_url,
+                         tmp_dir=tmp_dir)
        self.max_num_workers = max_num_workers
        self.max_workers_per_gpu = max_workers_per_gpu
        self.keep_tmp_file = keep_tmp_file
@ -101,11 +109,12 @@ class LocalRunner(BaseRunner):
                num_gpus = task.num_gpus
                assert len(all_gpu_ids) >= num_gpus
                # get cmd
-                mmengine.mkdir_or_exist('tmp/')
+                mmengine.mkdir_or_exist(self.tmp_dir)
                import uuid
                uuid_str = str(uuid.uuid4())

-                param_file = f'tmp/{uuid_str}_params.py'
+                param_file = f'{uuid_str}_params.py'
+                param_file = osp.join(self.tmp_dir, param_file)
                try:
                    task.cfg.dump(param_file)
                    # if use torchrun, restrict it behaves the same as non
@ -135,7 +144,8 @@ class LocalRunner(BaseRunner):
                        else:
                            task.run()
                    else:
-                        tmp_logs = f'tmp/{os.getpid()}_debug.log'
+                        tmp_logs = f'{os.getpid()}_debug.log'
+                        tmp_logs = osp.join(self.tmp_dir, tmp_logs)
                        get_logger().warning(
                            f'Debug mode, log will be saved to {tmp_logs}')
                        with open(tmp_logs, 'a') as log_file:
@ -207,14 +217,13 @@ class LocalRunner(BaseRunner):

        task_name = task.name

-        pwd = os.getcwd()
        # Dump task config to file
-        mmengine.mkdir_or_exist('tmp/')
+        mmengine.mkdir_or_exist(self.tmp_dir)
        # Using uuid to avoid filename conflict
        import uuid
        uuid_str = str(uuid.uuid4())
-        param_file = f'{pwd}/tmp/{uuid_str}_params.py'
-
+        param_file = f'{uuid_str}_params.py'
+        param_file = osp.join(self.tmp_dir, param_file)
        try:
            task.cfg.dump(param_file)
            tmpl = get_command_template(gpu_ids)
--- a/opencompass/runners/local_api.py
+++ b/opencompass/runners/local_api.py
@ -161,6 +161,8 @@ class LocalAPIRunner(BaseRunner):
            Defaults to 16.
        debug (bool): Whether to run in debug mode.
        lark_bot_url (str): Lark bot url.
+        tmp_dir (str): The directory to store temporary files.
+            Defaults to 'tmp'.
    """

    def __init__(self,
@ -168,8 +170,12 @@ class LocalAPIRunner(BaseRunner):
                 concurrent_users: int,
                 max_num_workers: int = 16,
                 debug: bool = False,
-                 lark_bot_url: str = None):
-        super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
+                 lark_bot_url: str = None,
+                 tmp_dir: str = 'tmp'):
+        super().__init__(task=task,
+                         debug=debug,
+                         lark_bot_url=lark_bot_url,
+                         tmp_dir=tmp_dir)
        self.max_num_workers = max_num_workers
        self.concurrent_users = concurrent_users
        assert task['type'] in [
@ -194,8 +200,9 @@ class LocalAPIRunner(BaseRunner):
                task = TASKS.build(dict(cfg=task, type=self.task_cfg['type']))
                task_name = task.name
                # get cmd
-                mmengine.mkdir_or_exist('tmp/')
-                param_file = f'tmp/{os.getpid()}_params.py'
+                mmengine.mkdir_or_exist(self.tmp_dir)
+                param_file = f'{os.getpid()}_params.py'
+                param_file = osp.join(self.tmp_dir, param_file)
                try:
                    task.cfg.dump(param_file)
                    cmd = task.get_command(cfg_path=param_file,
--- a/opencompass/runners/slurm.py
+++ b/opencompass/runners/slurm.py
@ -33,6 +33,8 @@ class SlurmRunner(BaseRunner):
        lark_bot_url (str): Lark bot url. Defaults to None.
        extra_command (List, optional): Extra slurm command.
            For example ['-c 12', '-w node1']. Defaults to None.
+        tmp_dir (str): The directory to store temporary files.
+            Defaults to 'tmp'.
    """

    def __init__(self,
@ -44,8 +46,12 @@ class SlurmRunner(BaseRunner):
                 qos: str = None,
                 debug: bool = False,
                 lark_bot_url: str = None,
-                 extra_command: Optional[List[str]] = None):
-        super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
+                 extra_command: Optional[List[str]] = None,
+                 tmp_dir: str = 'tmp'):
+        super().__init__(task=task,
+                         debug=debug,
+                         lark_bot_url=lark_bot_url,
+                         tmp_dir=tmp_dir)
        self.max_num_workers = max_num_workers
        self.retry = retry
        self.partition = partition
@ -93,8 +99,9 @@ class SlurmRunner(BaseRunner):
        task_name = task.name

        # Dump task config to file
-        mmengine.mkdir_or_exist('tmp/')
-        param_file = f'tmp/{os.getpid()}_params.py'
+        mmengine.mkdir_or_exist(self.tmp_dir)
+        param_file = f'{os.getpid()}_params.py'
+        param_file = osp.join(self.tmp_dir, param_file)
        try:
            cfg.dump(param_file)

--- a/opencompass/runners/slurm_sequential.py
+++ b/opencompass/runners/slurm_sequential.py
@ -47,6 +47,10 @@ class SlurmSequentialRunner(BaseRunner):
        lark_bot_url (str): Lark bot url. Defaults to None.
        extra_command (List, optional): Extra slurm command.
            For example ['-c 12', '-w node1']. Defaults to None.
+        keep_tmp_file (bool): Whether to keep the temporary file. Defaults to
+            False.
+        tmp_dir (str): The directory to store temporary files.
+            Defaults to 'tmp'.
    """

    def __init__(self,
@ -60,8 +64,12 @@ class SlurmSequentialRunner(BaseRunner):
                 debug: bool = False,
                 lark_bot_url: str = None,
                 extra_command: Optional[List[str]] = None,
-                 keep_tmp_file: bool = False):
-        super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
+                 keep_tmp_file: bool = False,
+                 tmp_dir: str = 'tmp'):
+        super().__init__(task=task,
+                         debug=debug,
+                         lark_bot_url=lark_bot_url,
+                         tmp_dir=tmp_dir)
        self.max_num_workers = max_num_workers
        self.retry = retry
        self.partition = partition
@ -172,11 +180,12 @@ class SlurmSequentialRunner(BaseRunner):
        task_name = self.task_prefix + task_name

        # Dump task config to file
-        mmengine.mkdir_or_exist('tmp/')
+        mmengine.mkdir_or_exist(self.tmp_dir)
        # Using uuid to avoid filename conflict
        import uuid
        uuid_str = str(uuid.uuid4())
-        param_file = f'tmp/{uuid_str}_params.py'
+        param_file = f'{uuid_str}_params.py'
+        param_file = osp.join(self.tmp_dir, param_file)
        process = None
        try:
            cfg.dump(param_file)
--- a/opencompass/runners/volc.py
+++ b/opencompass/runners/volc.py
@ -36,6 +36,9 @@ class VOLCRunner(BaseRunner):
        retry (int): Number of retries when job failed. Default: 2.
        debug (bool): Whether to run in debug mode. Default: False.
        lark_bot_url (str): Lark bot url. Default: None.
+        keep_tmp_file (bool): Whether to keep the temporary file. Default:
+            True.
+        tmp_dir (str): The directory to store temporary files. Default: 'tmp'.
    """

    def __init__(self,
@ -48,8 +51,12 @@ class VOLCRunner(BaseRunner):
                 retry: int = 2,
                 debug: bool = False,
                 lark_bot_url: str = None,
-                 keep_tmp_file: bool = True):
-        super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
+                 keep_tmp_file: bool = True,
+                 tmp_dir: str = 'tmp'):
+        super().__init__(task=task,
+                         debug=debug,
+                         lark_bot_url=lark_bot_url,
+                         tmp_dir=tmp_dir)
        self.volcano_cfg = volcano_cfg
        self.max_num_workers = max_num_workers
        self.retry = retry
@ -101,13 +108,15 @@ class VOLCRunner(BaseRunner):
        # Build up VCC command
        pwd = os.getcwd()
        # Dump task config to file
-        mmengine.mkdir_or_exist('tmp/')
+        mmengine.mkdir_or_exist(self.tmp_dir)
        # Using uuid to avoid filename conflict
        import uuid
        uuid_str = str(uuid.uuid4())
-        param_file = f'{pwd}/tmp/{uuid_str}_params.py'
+        param_file = f'{uuid_str}_params.py'
+        param_file = osp.join(self.tmp_dir, param_file)

-        volc_cfg_file = f'{pwd}/tmp/{uuid_str}_cfg.yaml'
+        volc_cfg_file = f'{uuid_str}_cfg.yaml'
+        volc_cfg_file = osp.join(self.tmp_dir, volc_cfg_file)
        volc_cfg = self._choose_flavor(num_gpus)
        with open(volc_cfg_file, 'w') as fp:
            yaml.dump(volc_cfg, fp, sort_keys=False)