mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
change max_task_size to dynamic
This commit is contained in:
parent
31afe87026
commit
db8d9a9798
@ -91,6 +91,25 @@ class SizePartitioner(BasePartitioner):
|
|||||||
key=lambda x: self.get_cost(x),
|
key=lambda x: self.get_cost(x),
|
||||||
reverse=True)
|
reverse=True)
|
||||||
for model in comb['models']:
|
for model in comb['models']:
|
||||||
|
# modified here in order to maximize utilization of GPUs:
|
||||||
|
# change the CONSTANT max_task_size parameter to DYNAMIC based on model configuration
|
||||||
|
# if the model requires X GPUs,
|
||||||
|
# and the number of available GPUs is Y,
|
||||||
|
# then at least Y // X tasks can run in parallel, so max_task_size must be small enough to produce at least Y // X tasks
|
||||||
|
X = model['run_cfg']['num_gpus']
|
||||||
|
Y = torch.cuda.device_count()
|
||||||
|
min_num_parallel_tasks = Y // X
|
||||||
|
if min_num_parallel_tasks > 1:
|
||||||
|
num_datasets = len(model_dataset_combinations[0]['datasets'])
|
||||||
|
total_estimated_size = num_datasets * \
|
||||||
|
min(
|
||||||
|
model['batch_size'],
|
||||||
|
sum([ele['num_repeats_per_file'] for ele in model_dataset_combinations[0]['datasets']]) //
|
||||||
|
len([ele['num_repeats_per_file'] for ele in model_dataset_combinations[0]['datasets']])
|
||||||
|
) * \
|
||||||
|
self.gen_task_coef
|
||||||
|
self.max_task_size = total_estimated_size // min_num_parallel_tasks - 1
|
||||||
|
|
||||||
chunks = [] # elements: tuple(size, dataset_chunk)
|
chunks = [] # elements: tuple(size, dataset_chunk)
|
||||||
for dataset in comb['datasets']:
|
for dataset in comb['datasets']:
|
||||||
filename = get_infer_output_path(model, dataset, out_dir)
|
filename = get_infer_output_path(model, dataset, out_dir)
|
||||||
|
Loading…
Reference in New Issue
Block a user