Enhance run.py (#7)

* Enhance run.py * update
2025-05-30 16:03:24 +08:00 · 2023-07-06 11:58:37 +08:00 · 2023-07-06 11:58:37 +08:00 · cd1bec5f2a
commit cd1bec5f2a
parent 5c19c8c5fc
2 changed files with 104 additions and 207 deletions
--- a/run.py
+++ b/run.py
@ -7,6 +7,7 @@ from datetime import datetime
 from mmengine.config import Config
 from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.registry import PARTITIONERS, RUNNERS
 from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
 from opencompass.utils import LarkReporter, Summarizer, get_logger
@ -14,18 +15,19 @@ from opencompass.utils import LarkReporter, Summarizer, get_logger
 def parse_args():
    parser = argparse.ArgumentParser(description='Run an evaluation task')
    parser.add_argument('config', help='Train config file path')
-    # add mutually exclusive args `--slurm` `--dlc`, default to local runner
+    # add mutually exclusive args `--slurm` `--dlc`, defaults to local runner
-    luach_method = parser.add_mutually_exclusive_group()
+    # if "infer" or "eval" not specified
-    luach_method.add_argument('--slurm',
+    launch_method = parser.add_mutually_exclusive_group()
    launch_method.add_argument('--slurm',
                               action='store_true',
                               default=False,
-                              help='Whether to use srun to launch tasks, if '
+                               help='Whether to force tasks to run with srun. '
-                              'True, `--partition(-p)` must be set. Defaults'
+                               'If True, `--partition(-p)` must be set. '
-                              ' to False')
+                               'Defaults to False')
-    luach_method.add_argument('--dlc',
+    launch_method.add_argument('--dlc',
                               action='store_true',
                               default=False,
-                              help='Whether to use dlc to launch tasks, if '
+                               help='Whether to force tasks to run on dlc. If '
                               'True, `--aliyun-cfg` must be set. Defaults'
                               ' to False')
    # add general args
@ -56,10 +58,11 @@ def parse_args():
                        'also be a specific timestamp, e.g. 20230516_144254'),
    parser.add_argument('-w',
                        '--work-dir',
-                        help='Work path, all the outputs will be saved in '
+                        help='Work path, all the outputs will be '
-                        'this path, including the slurm logs, the evaluation'
+                        'saved in this path, including the slurm logs, '
-                        ' results, the summary results, etc. If not specified,'
+                        'the evaluation results, the summary results, etc.'
-                        ' the work_dir will be set to None',
+                        'If not specified, the work_dir will be set to '
                        './outputs/default.',
                        default=None,
                        type=str)
    parser.add_argument('-l',
@ -68,21 +71,26 @@ def parse_args():
                        action='store_true',
                        default=False)
    parser.add_argument('--max-partition-size',
-                        help='The maximum size of a task.',
+                        help='The maximum size of an infer task. Only '
                        'effective when "infer" is missing from the config.',
                        type=int,
                        default=2000),
    parser.add_argument(
        '--gen-task-coef',
-        help='The dataset cost measurement coefficient for generation tasks',
+        help='The dataset cost measurement coefficient for generation tasks, '
        'Only effective when "infer" is missing from the config.',
        type=int,
        default=20)
    parser.add_argument('--max-num-workers',
-                        help='Max number of workers to run in parallel.',
+                        help='Max number of workers to run in parallel. '
                        'Will be overrideen by the "max_num_workers" argument '
                        'in the config.',
                        type=int,
                        default=32)
    parser.add_argument(
        '--retry',
-        help='Number of retries if the job failed when using slurm or dlc.',
+        help='Number of retries if the job failed when using slurm or dlc. '
        'Will be overrideen by the "retry" argument in the config.',
        type=int,
        default=2)
    # set srun args
@ -97,14 +105,14 @@ def parse_args():
            '--partition(-p) must be set if you want to use slurm')
    if args.dlc:
        assert os.path.exists(args.aliyun_cfg), (
-            'When luaching tasks using dlc, it needs to be configured'
+            'When launching tasks using dlc, it needs to be configured '
            'in "~/.aliyun.cfg", or use "--aliyun-cfg $ALiYun-CFG_Path"'
            ' to specify a new path.')
    return args
 def parse_slurm_args(slurm_parser):
-    """these args are all for slurm launch."""
+    """These args are all for slurm launch."""
    slurm_parser.add_argument('-p',
                              '--partition',
                              help='Slurm partition name',
@ -113,12 +121,12 @@ def parse_slurm_args(slurm_parser):
    slurm_parser.add_argument('-q',
                              '--quotatype',
                              help='Slurm quota type',
-                              default='auto',
+                              default=None,
                              type=str)
 def parse_dlc_args(dlc_parser):
-    """these args are all for dlc launch."""
+    """These args are all for dlc launch."""
    dlc_parser.add_argument('--aliyun-cfg',
                            help='The config path for aliyun config',
                            default='~/.aliyun.cfg',
@ -171,22 +179,71 @@ def main():
        LarkReporter(cfg['lark_bot_url']).post(content)
    if args.mode in ['all', 'infer']:
        if (args.dlc or args.slurm) and cfg.get('infer', None):
            logger.warning('You have set "infer" in the config, but '
                           'also specified --slurm or --dlc. '
                           'The "infer" configuration will be overridden by '
                           'your runtime arguments.')
        if args.dlc or args.slurm or cfg.get('infer', None) is None:
            # Use SizePartitioner to split into subtasks
-        partitioner = SizePartitioner(osp.join(cfg['work_dir'],
+            partitioner = SizePartitioner(
-                                               'predictions/'),
+                osp.join(cfg['work_dir'], 'predictions/'),
                max_task_size=args.max_partition_size,
                gen_task_coef=args.gen_task_coef)
            tasks = partitioner(cfg)
            # execute the infer subtasks
            exec_infer_runner(tasks, args, cfg)
        else:
            if args.partition is not None:
                if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                    cfg.infer.runner.partition = args.partition
                    cfg.infer.runner.quotatype = args.quotatype
            else:
                logger.warning('SlurmRunner is not used, so the partition '
                               'argument is ignored.')
            if args.debug:
                cfg.infer.runner.debug = True
            if args.lark:
                cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
            cfg.infer.partitioner['out_dir'] = osp.join(
                cfg['work_dir'], 'predictions/')
            partitioner = PARTITIONERS.build(cfg.infer.partitioner)
            tasks = partitioner(cfg)
            runner = RUNNERS.build(cfg.infer.runner)
            runner(tasks)
    # evaluate
    if args.mode in ['all', 'eval']:
        if (args.dlc or args.slurm) and cfg.get('eval', None):
            logger.warning('You have set "eval" in the config, but '
                           'also specified --slurm or --dlc. '
                           'The "eval" configuration will be overridden by '
                           'your runtime arguments.')
        if args.dlc or args.slurm or cfg.get('eval', None) is None:
            # Use NaivePartitioner，not split
-        partitioner = NaivePartitioner(osp.join(cfg['work_dir'], 'results/'))
+            partitioner = NaivePartitioner(
                osp.join(cfg['work_dir'], 'results/'))
            tasks = partitioner(cfg)
            # execute the eval tasks
            exec_eval_runner(tasks, args, cfg)
        else:
            if args.partition is not None:
                if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                    cfg.eval.runner.partition = args.partition
                    cfg.eval.runner.quotatype = args.quotatype
                else:
                    logger.warning('SlurmRunner is not used, so the partition '
                                   'argument is ignored.')
            if args.debug:
                cfg.eval.runner.debug = True
            if args.lark:
                cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
            cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'],
                                                       'results/')
            partitioner = PARTITIONERS.build(cfg.eval.partitioner)
            tasks = partitioner(cfg)
            runner = RUNNERS.build(cfg.eval.runner)
            runner(tasks)
    # visualize
    if args.mode in ['all', 'eval', 'viz']:
@ -212,8 +269,7 @@ def exec_infer_runner(tasks, args, cfg):
                           debug=args.debug,
                           lark_bot_url=cfg['lark_bot_url'])
    else:
-        runner = LocalRunner(
+        runner = LocalRunner(task=dict(type='OpenICLInferTask'),
            task=dict(type='OpenICLInferTask'),
                             max_num_workers=args.max_num_workers,
                             debug=args.debug,
                             lark_bot_url=cfg['lark_bot_url'])
@ -238,8 +294,7 @@ def exec_eval_runner(tasks, args, cfg):
                           debug=args.debug,
                           lark_bot_url=cfg['lark_bot_url'])
    else:
-        runner = LocalRunner(
+        runner = LocalRunner(task=dict(type='OpenICLEvalTask'),
            task=dict(type='OpenICLEvalTask'),
                             max_num_workers=args.max_num_workers,
                             debug=args.debug,
                             lark_bot_url=cfg['lark_bot_url'])
--- a/tools/cfg_run.py
+++ b/tools/cfg_run.py
@ -1,158 +0,0 @@
 import argparse
 import getpass
 import os
 import os.path as osp
 from datetime import datetime
 from mmengine.config import Config
 from opencompass.registry import PARTITIONERS, RUNNERS
 from opencompass.runners import SlurmRunner
 from opencompass.utils import LarkReporter, Summarizer, get_logger
 def parse_args():
    """Parse the command-line arguments of the config-driven launcher.

    The arguments are read from ``sys.argv`` by ``argparse``.

    Returns:
        argparse.Namespace: Parsed arguments with the attributes ``config``,
        ``partition``, ``quotatype``, ``debug``, ``mode``, ``reuse``,
        ``work_dir`` and ``lark``.
    """
    parser = argparse.ArgumentParser(description='Run an evaluation task')
    parser.add_argument('config', help='Train config file path')
    parser.add_argument('-p',
                        '--partition',
                        help='Slurm partition name',
                        default=None,
                        type=str)
    parser.add_argument('-q',
                        '--quotatype',
                        help='Slurm quota type',
                        default='auto',
                        type=str)
    parser.add_argument('--debug',
                        help='Debug mode, in which scheduler will run tasks '
                        'in the single process, and output will not be '
                        'redirected to files',
                        action='store_true',
                        default=False)
    parser.add_argument('-m',
                        '--mode',
                        help='Running mode. You can choose "infer" if you '
                        'only want the inference results, or "eval" if you '
                        'already have the results and want to evaluate them, '
                        'or "viz" if you want to visualize the results.',
                        choices=['all', 'infer', 'eval', 'viz'],
                        default='all',
                        type=str)
    # A bare `-r` (no value) resolves to the const 'latest'; omitting the
    # flag entirely leaves `args.reuse` as None.
    parser.add_argument('-r',
                        '--reuse',
                        nargs='?',
                        type=str,
                        const='latest',
                        help='Reuse previous outputs & results, and run any '
                        'missing jobs presented in the config. If its '
                        'argument is not specified, the latest results in '
                        'the work_dir will be reused. The argument should '
                        'also be a specific timestamp, e.g. 20230516_144254')
    # The adjacent literals below are concatenated by the parser, so each
    # fragment must carry its own trailing space.
    parser.add_argument('-w',
                        '--work-dir',
                        help='Work path, all the outputs will be '
                        'saved in this path, including the slurm logs, '
                        'the evaluation results, the summary results, etc. '
                        'If not specified, the work_dir will be set to '
                        './outputs/default.',
                        default=None,
                        type=str)
    parser.add_argument('-l',
                        '--lark',
                        help='Report the running status to lark bot',
                        action='store_true',
                        default=False)
    args = parser.parse_args()
    return args
 def main():
    """Run the infer / eval / viz stages described by a config file.

    Steps, in order:
      1. Parse CLI arguments and load the config given by ``args.config``.
      2. Resolve the timestamped work directory (a fresh timestamp, or a
         previous one when ``--reuse`` is given) and dump the config into
         its ``configs`` sub-directory.
      3. If the config has an ``infer`` section and the mode is ``all`` or
         ``infer``, build its partitioner and runner and run the tasks.
      4. If the config has an ``eval`` section and the mode is ``all`` or
         ``eval``, do the same for evaluation.
      5. In ``all``, ``eval`` or ``viz`` mode, summarize the results.

    Raises:
        ValueError: If the mode is ``eval`` or ``viz`` and ``--reuse`` was
            not given.
        AssertionError: If ``--reuse`` is given without a timestamp and the
            work directory has no entries.
    """
    args = parse_args()

    # initialize logger
    logger = get_logger(log_level='DEBUG' if args.debug else 'INFO')

    cfg = Config.fromfile(args.config)
    # The CLI work dir takes precedence; otherwise keep the one from the
    # config, falling back to './outputs/default/'.
    if args.work_dir is not None:
        cfg['work_dir'] = args.work_dir
    else:
        cfg.setdefault('work_dir', './outputs/default/')

    # cfg_time_str defaults to the current time
    cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
    if args.reuse:
        if args.reuse == 'latest':
            # Entries are sorted by name and the last one is taken as the
            # most recent run.
            dirs = os.listdir(cfg.work_dir)
            assert len(dirs) > 0, 'No previous results to reuse!'
            dir_time_str = sorted(dirs)[-1]
        else:
            dir_time_str = args.reuse
        logger.info(f'Reusing experiements from {dir_time_str}')
    elif args.mode in ['eval', 'viz']:
        raise ValueError('You must specify -r or --reuse when running in eval '
                         'or viz mode!')

    # update "actual" work_dir
    cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
    os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True)

    # dump config; the file is always named after the current time, even
    # when the directory of a previous run is reused
    output_config_path = osp.join(cfg.work_dir, 'configs',
                                  f'{cfg_time_str}.py')
    cfg.dump(output_config_path)
    # Config is intentionally reloaded here so that it does not hold
    # initialized types, which cannot be serialized
    cfg = Config.fromfile(output_config_path)

    # Lark reporting: disabled unless --lark is given; when enabled and a
    # bot url is configured, announce that the job has started.
    if not args.lark:
        cfg['lark_bot_url'] = None
    elif cfg.get('lark_bot_url', None):
        content = f'{getpass.getuser()} 的新任务已启动！'
        LarkReporter(cfg['lark_bot_url']).post(content)

    # infer
    if cfg.get('infer', None) is not None and args.mode in ['all', 'infer']:
        if args.partition is not None:
            # Partition / quota type are only forwarded to a SlurmRunner.
            if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                cfg.infer.runner.partition = args.partition
                cfg.infer.runner.quotatype = args.quotatype
            else:
                logger.warning('SlurmRunner is not used, so the partition '
                               'argument is ignored.')
        if args.debug:
            cfg.infer.runner.debug = True
        if args.lark:
            cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
        cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
                                                    'predictions/')
        partitioner = PARTITIONERS.build(cfg.infer.partitioner)
        tasks = partitioner(cfg)
        runner = RUNNERS.build(cfg.infer.runner)
        runner(tasks)

    # evaluate
    if cfg.get('eval', None) is not None and args.mode in ['all', 'eval']:
        if args.partition is not None:
            # Check the *eval* runner here: consulting the infer runner
            # would apply the wrong test and would fail when the config
            # has no "infer" section.
            if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner:
                cfg.eval.runner.partition = args.partition
                cfg.eval.runner.quotatype = args.quotatype
            else:
                logger.warning('SlurmRunner is not used, so the partition '
                               'argument is ignored.')
        if args.debug:
            cfg.eval.runner.debug = True
        if args.lark:
            cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
        partitioner = PARTITIONERS.build(cfg.eval.partitioner)
        tasks = partitioner(cfg)
        runner = RUNNERS.build(cfg.eval.runner)
        runner(tasks)

    # visualize
    if args.mode in ['all', 'eval', 'viz']:
        summarizer = Summarizer(cfg)
        summarizer.summarize(time_str=cfg_time_str)
 # Run the launcher only when this file is executed as a script, not when
 # it is imported as a module.
 if __name__ == '__main__':
    main()