Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Feature] Add JudgeLLMs (#710)
* add judgellms
* add sub_size_partition
* add docs
* add ref
This commit is contained in:
parent eda72e756e
commit 97c2068bd9
configs/eval_subjective_judge_pandalm.py (new file, 84 lines)
@@ -0,0 +1,84 @@
```python
from mmengine.config import read_base

with read_base():
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b
    from .datasets.subjective_cmp.alignment_bench import subjective_datasets

datasets = [*subjective_datasets]

from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import AlignmentBenchSummarizer

# ------------- Inference Stage ----------------------------------------

models = [*hf_baichuan2_7b]  # , *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)

# ------------- Evaluation Stage ----------------------------------------

# ------------- JudgeLLM Configuration
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

judge_model = dict(
    type=HuggingFaceCausalLM,
    abbr='pandalm-7b-v1-hf',
    path='WeOpenML/PandaLM-7B-v1',
    tokenizer_path='WeOpenML/PandaLM-7B-v1',
    tokenizer_kwargs=dict(padding_side='left',
                          truncation_side='left',
                          trust_remote_code=True,
                          use_fast=False),
    max_out_len=512,
    max_seq_len=2048,
    batch_size=8,
    model_kwargs=dict(device_map='auto', trust_remote_code=True),
    run_cfg=dict(num_gpus=1, num_procs=1),
)

# ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='singlescore',
        models=[*hf_baichuan2_7b],
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=judge_model,
        )),
)

summarizer = dict(
    type=AlignmentBenchSummarizer,
)

work_dir = 'outputs/pandalm'
```
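Both `LocalRunner` and `SlurmSequentialRunner` are imported in the config above, but only the Slurm runner is exercised for inference. For a single-machine run without Slurm, a minimal swap of the `infer` block might look like the following sketch, assuming local GPUs are available (the `max_num_workers` value is illustrative):

```python
# Sketch: local inference instead of Slurm (assumes local GPUs are available).
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,  # illustrative; tune to the number of local GPUs
        task=dict(type=OpenICLInferTask)),
)
```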
configs/models/judge_llm/auto_j/hf_autoj_bilingual_6b.py (new file, 26 lines)
@@ -0,0 +1,26 @@
```python
from opencompass.models import HuggingFaceCausalLM

'''
This is a bilingual 6B version of Auto-J.
It is trained on both the original training data
and its Chinese translation, which can be found at
https://huggingface.co/GAIR/autoj-bilingual-6b
'''

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='autoj-bilingual-6b',
        path='GAIR/autoj-bilingual-6b',
        tokenizer_path='GAIR/autoj-bilingual-6b',
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              trust_remote_code=True,
                              use_fast=False),
        max_out_len=512,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
```
configs/models/judge_llm/auto_j/hf_autoj_eng_13b.py (new file, 20 lines)
@@ -0,0 +1,20 @@
```python
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='autoj-13b',
        path='GAIR/autoj-13b',
        tokenizer_path='GAIR/autoj-13b',
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              trust_remote_code=True,
                              use_fast=False),
        max_out_len=512,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
```
configs/models/judge_llm/auto_j/hf_autoj_eng_13b_4bit.py (new file, 25 lines)
@@ -0,0 +1,25 @@
```python
from opencompass.models import HuggingFaceCausalLM

'''
This is a 4-bit quantized version of Auto-J produced with AutoGPTQ,
which is available on the Hugging Face Hub:
https://huggingface.co/GAIR/autoj-13b-GPTQ-4bits
'''

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='autoj-13b-GPTQ-4bits',
        path='GAIR/autoj-13b-GPTQ-4bits',
        tokenizer_path='GAIR/autoj-13b-GPTQ-4bits',
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              trust_remote_code=True,
                              use_fast=False),
        max_out_len=512,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
```
configs/models/judge_llm/auto_j/hf_autoj_scen_classifier.py (new file, 20 lines)
@@ -0,0 +1,20 @@
```python
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='autoj-scenario-classifier',
        path='GAIR/autoj-scenario-classifier',
        tokenizer_path='GAIR/autoj-scenario-classifier',
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              trust_remote_code=True,
                              use_fast=False),
        max_out_len=512,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
```
configs/models/judge_llm/judgelm/hf_judgelm_13b_v1.py (new file, 20 lines)
@@ -0,0 +1,20 @@
```python
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='judgelm-13b-v1-hf',
        path='BAAI/JudgeLM-13b-v1.0',
        tokenizer_path='BAAI/JudgeLM-13b-v1.0',
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              trust_remote_code=True,
                              use_fast=False),
        max_out_len=512,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
```
configs/models/judge_llm/judgelm/hf_judgelm_33b_v1.py (new file, 20 lines)
@@ -0,0 +1,20 @@
```python
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='judgelm-33b-v1-hf',
        path='BAAI/JudgeLM-33b-v1.0',
        tokenizer_path='BAAI/JudgeLM-33b-v1.0',
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              trust_remote_code=True,
                              use_fast=False),
        max_out_len=512,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
```
configs/models/judge_llm/judgelm/hf_judgelm_7b_v1.py (new file, 20 lines)
@@ -0,0 +1,20 @@
```python
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='judgelm-7b-v1-hf',
        path='BAAI/JudgeLM-7B-v1.0',
        tokenizer_path='BAAI/JudgeLM-7B-v1.0',
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              trust_remote_code=True,
                              use_fast=False),
        max_out_len=512,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
```
configs/models/judge_llm/pandalm/hf_alpaca_pandalm_7b_v1.py (new file, 20 lines)
@@ -0,0 +1,20 @@
```python
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='alpaca-pandalm-7b-v1-hf',
        path='WeOpenML/PandaLM-Alpaca-7B-v1',
        tokenizer_path='WeOpenML/PandaLM-Alpaca-7B-v1',
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              trust_remote_code=True,
                              use_fast=False),
        max_out_len=512,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
```
configs/models/judge_llm/pandalm/hf_pandalm_7b_v1.py (new file, 20 lines)
@@ -0,0 +1,20 @@
```python
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='pandalm-7b-v1-hf',
        path='WeOpenML/PandaLM-7B-v1',
        tokenizer_path='WeOpenML/PandaLM-7B-v1',
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              trust_remote_code=True,
                              use_fast=False),
        max_out_len=512,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
```
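All of the judge model configs above share the same `HuggingFaceCausalLM` skeleton and differ only in `abbr` and `path`. If you add further judges, a small helper could keep the boilerplate in one place; the function below is hypothetical (not part of this commit), a minimal sketch of that idea:

```python
from opencompass.models import HuggingFaceCausalLM


def make_judge_cfg(abbr: str, path: str) -> dict:
    """Hypothetical helper: build a judge model config like the ones above."""
    return dict(
        type=HuggingFaceCausalLM,
        abbr=abbr,
        path=path,
        tokenizer_path=path,
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              trust_remote_code=True,
                              use_fast=False),
        max_out_len=512,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )


models = [make_judge_cfg('judgelm-7b-v1-hf', 'BAAI/JudgeLM-7B-v1.0')]
```

OpenCompass configs conventionally keep one explicit dict per file so that `read_base` imports stay simple, which is likely why this commit does not use such a factory.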
English documentation:

@@ -144,6 +144,64 @@ The `-r` parameter allows the reuse of model inference and GPT-4 evaluation resu
The response of JudgeLLM will be output to `output/.../results/timestamp/xxmodel/xxdataset/.json`.
The evaluation report will be output to `output/.../summary/timestamp/report.csv`.

OpenCompass supports many JudgeLLMs; in fact, you can use any model in the OpenCompass configs as a JudgeLLM. The popular open-source JudgeLLMs are listed here:

1. Auto-J, refer to `configs/models/judge_llm/auto_j`

   Consider citing the following paper if you find it helpful:

   ```bibtex
   @article{li2023generative,
     title={Generative judge for evaluating alignment},
     author={Li, Junlong and Sun, Shichao and Yuan, Weizhe and Fan, Run-Ze and Zhao, Hai and Liu, Pengfei},
     journal={arXiv preprint arXiv:2310.05470},
     year={2023}
   }
   @misc{2023opencompass,
     title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
     author={OpenCompass Contributors},
     howpublished={\url{https://github.com/open-compass/opencompass}},
     year={2023}
   }
   ```

2. JudgeLM, refer to `configs/models/judge_llm/judgelm`

   Consider citing the following paper if you find it helpful:

   ```bibtex
   @article{zhu2023judgelm,
     title={JudgeLM: Fine-tuned Large Language Models are Scalable Judges},
     author={Zhu, Lianghui and Wang, Xinggang and Wang, Xinlong},
     journal={arXiv preprint arXiv:2310.17631},
     year={2023}
   }
   @misc{2023opencompass,
     title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
     author={OpenCompass Contributors},
     howpublished={\url{https://github.com/open-compass/opencompass}},
     year={2023}
   }
   ```

3. PandaLM, refer to `configs/models/judge_llm/pandalm`

   Consider citing the following paper if you find it helpful:

   ```bibtex
   @article{wang2023pandalm,
     title={PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization},
     author={Wang, Yidong and Yu, Zhuohao and Zeng, Zhengran and Yang, Linyi and Wang, Cunxiang and Chen, Hao and Jiang, Chaoya and Xie, Rui and Wang, Jindong and Xie, Xing and others},
     journal={arXiv preprint arXiv:2306.05087},
     year={2023}
   }
   @misc{2023opencompass,
     title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
     author={OpenCompass Contributors},
     howpublished={\url{https://github.com/open-compass/opencompass}},
     year={2023}
   }
   ```
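To use any of these judges, point the `judge_cfg` of `SubjectiveEvalTask` at the chosen model config, as in the evaluation config shown earlier. A minimal sketch, assuming the PandaLM config shipped in this commit (any other judge config works the same way):

```python
from mmengine.config import read_base

with read_base():
    # any config under configs/models/judge_llm can serve as the judge
    from .models.judge_llm.pandalm.hf_pandalm_7b_v1 import models as judge_models

from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

eval = dict(
    partitioner=dict(type=SubjectiveNaivePartitioner, mode='singlescore'),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,
        task=dict(type=SubjectiveEvalTask,
                  judge_cfg=judge_models[0])),  # the judge model config dict
)
```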
## Practice: AlignBench Evaluation

### Dataset
Chinese documentation:

@@ -142,6 +142,66 @@ python run.py configs/eval_subjective_score.py -r
The JudgeLLM evaluation responses will be saved in `output/.../results/timestamp/xxmodel/xxdataset/.json`.
The evaluation report will be output to `output/.../summary/timestamp/report.csv`.

OpenCompass already supports many JudgeLLMs; in fact, you can use any model supported in OpenCompass as a JudgeLLM. We list the currently popular open-source JudgeLLMs here:

1. Auto-J, see `configs/models/judge_llm/auto_j`

   If you use this method, please add the citation:

   ```bibtex
   @article{li2023generative,
     title={Generative judge for evaluating alignment},
     author={Li, Junlong and Sun, Shichao and Yuan, Weizhe and Fan, Run-Ze and Zhao, Hai and Liu, Pengfei},
     journal={arXiv preprint arXiv:2310.05470},
     year={2023}
   }
   @misc{2023opencompass,
     title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
     author={OpenCompass Contributors},
     howpublished={\url{https://github.com/open-compass/opencompass}},
     year={2023}
   }
   ```

2. JudgeLM, see `configs/models/judge_llm/judgelm`

   If you use this method, please add the citation:

   ```bibtex
   @article{zhu2023judgelm,
     title={JudgeLM: Fine-tuned Large Language Models are Scalable Judges},
     author={Zhu, Lianghui and Wang, Xinggang and Wang, Xinlong},
     journal={arXiv preprint arXiv:2310.17631},
     year={2023}
   }
   @misc{2023opencompass,
     title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
     author={OpenCompass Contributors},
     howpublished={\url{https://github.com/open-compass/opencompass}},
     year={2023}
   }
   ```

3. PandaLM, see `configs/models/judge_llm/pandalm`

   If you use this method, please add the citation:

   ```bibtex
   @article{wang2023pandalm,
     title={PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization},
     author={Wang, Yidong and Yu, Zhuohao and Zeng, Zhengran and Yang, Linyi and Wang, Cunxiang and Chen, Hao and Jiang, Chaoya and Xie, Rui and Wang, Jindong and Xie, Xing and others},
     journal={arXiv preprint arXiv:2306.05087},
     year={2023}
   }
   @misc{2023opencompass,
     title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
     author={OpenCompass Contributors},
     howpublished={\url{https://github.com/open-compass/opencompass}},
     year={2023}
   }
   ```
## Practice: AlignBench Subjective Evaluation

### Dataset Preparation
opencompass/partitioners/sub_naive.py:

```diff
@@ -8,18 +8,6 @@ from opencompass.registry import PARTITIONERS
 from .naive import NaivePartitioner
 
 
-def remove_duplicate_pairs(model_combinations):
-    combo_dict = {}
-    for i, combo in enumerate(model_combinations):
-        sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
-        if sorted_names not in combo_dict:
-            combo_dict[sorted_names] = i
-    new_model_combinations = [
-        model_combinations[i] for i in combo_dict.values()
-    ]
-    return new_model_combinations
-
-
 @PARTITIONERS.register_module()
 class SubjectiveNaivePartitioner(NaivePartitioner):
     """Naive task partitioner for subjective evaluation. Compared to
@@ -47,6 +35,17 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
         self.compare_models = compare_models
         self.model_pairs = model_pairs
 
+    def remove_duplicate_pairs(self, model_combinations):
+        combo_dict = {}
+        for i, combo in enumerate(model_combinations):
+            sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
+            if sorted_names not in combo_dict:
+                combo_dict[sorted_names] = i
+        new_model_combinations = [
+            model_combinations[i] for i in combo_dict.values()
+        ]
+        return new_model_combinations
+
     def get_model_combinations(
             self,
             models: List[ConfigDict],
@@ -58,7 +57,7 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
         elif self.mode == 'm2n':
             assert len(base_models) > 0 and len(compare_models) > 0
             model_combinations = list(product(base_models, compare_models))
-            unique_combinations = remove_duplicate_pairs([
+            unique_combinations = self.remove_duplicate_pairs([
                 combo for combo in model_combinations if combo[0] != combo[1]
             ])
             return unique_combinations
```
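This refactor moves `remove_duplicate_pairs` from a module-level function to a method, so subclasses such as the new `SubjectiveSizePartitioner` inherit it. A standalone sketch of the dedup logic it implements, with made-up model names for illustration:

```python
# Standalone copy of the partitioner's dedup logic, for illustration only.
def remove_duplicate_pairs(model_combinations):
    combo_dict = {}
    for i, combo in enumerate(model_combinations):
        # an unordered pair is identified by its sorted abbr names
        sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
        if sorted_names not in combo_dict:
            combo_dict[sorted_names] = i
    return [model_combinations[i] for i in combo_dict.values()]


a, b = {'abbr': 'qwen-7b'}, {'abbr': 'chatglm3-6b'}
# (a, b) and (b, a) are the same unordered pair, so only the first survives
print(remove_duplicate_pairs([(a, b), (b, a)]))
# -> [({'abbr': 'qwen-7b'}, {'abbr': 'chatglm3-6b'})]
```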
opencompass/partitioners/sub_size.py (new file, 245 lines)
@@ -0,0 +1,245 @@
```python
import copy
import math
import os.path as osp
from fnmatch import fnmatch
from typing import Dict, List, Optional, Tuple, Union

import mmengine
from mmengine.config import Config, ConfigDict

from opencompass.registry import PARTITIONERS
from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
                               get_infer_output_path)

from .sub_naive import SubjectiveNaivePartitioner


@PARTITIONERS.register_module()
class SubjectiveSizePartitioner(SubjectiveNaivePartitioner):
    """Task partitioner based on the size of the dataset (with some rough
    expansion as an estimation of computational cost).

    Args:
        out_dir (str): The output directory of tasks.
        max_task_size (int): The maximum size of a task.
        gen_task_coef (int): The dataset cost measurement coefficient for
            generation tasks.
        strategy (str): The partition strategy. Supported strategies are:
            'heuristic' and 'split'. Defaults to 'heuristic'.
            heuristic: split large datasets into several tasks, merge small
                datasets into one task.
            split: split large datasets into several tasks only.
        dataset_size_path (str): The path to the dataset size cache file.
        keep_keys (list[str]): The keys to be kept from the experiment config
            to the task config.
    """

    def __init__(self,
                 mode: str,
                 out_dir: str,
                 models: Optional[List[ConfigDict]] = [],
                 base_models: Optional[List[ConfigDict]] = [],
                 compare_models: Optional[List[ConfigDict]] = [],
                 model_pairs: Optional[List[Tuple]] = None,
                 max_task_size: int = 40000,
                 gen_task_coef: int = 20,
                 strategy: str = 'heuristic',
                 dataset_size_path: str = '.cache/dataset_size.json',
                 keep_keys: Optional[List[str]] = None):
        super().__init__(out_dir=out_dir,
                         keep_keys=keep_keys,
                         mode=mode,
                         models=models,
                         base_models=base_models,
                         compare_models=compare_models,
                         model_pairs=model_pairs)
        self.max_task_size = max_task_size
        self.gen_task_coef = gen_task_coef
        self.dataset_size_path = dataset_size_path
        assert strategy in ('heuristic', 'split'), \
            f'Unsupported partition strategy: {strategy}. '\
            'Supported strategies are: `heuristic`, `split` .'
        self.strategy = strategy

    def partition(self,
                  models: List[ConfigDict],
                  datasets: List[ConfigDict],
                  work_dir: str,
                  out_dir: str,
                  add_cfg: Dict = {}) -> List[ConfigDict]:
        """Partition model-dataset pairs into tasks. Each task is defined as a
        dict and will run independently as a unit. Its structure is as
        follows:

        .. code-block:: python

            {
                'models': [],  # a list of model configs
                'datasets': [[]],  # a nested list of dataset configs,
                                   # each list corresponds to a model
                'work_dir': '',  # the work dir
                **add_cfg  # other keys to be kept in the config
            }

        Args:
            models (List[ConfigDict]): A list of model configs.
            datasets (List[ConfigDict]): A list of dataset configs.
            work_dir (str): The work dir for the task.
            out_dir (str): The full output path for the task, intended for
                Partitioners to check whether the task is finished via the
                existence of the result file in this directory.
            add_cfg (dict): Other common keys to be added in the task config,
                used to share the same config among tasks. Defaults to {}.

        Returns:
            List[ConfigDict]: A list of tasks.
        """
        models = self.models if self.models != [] else models
        base_models, compare_models = self.base_models, self.compare_models
        if self.mode == 'singlescore':
            models = models
        else:
            models = super().get_model_combinations(models, base_models,
                                                    compare_models)
        model_dataset_combinations = [{'models': models, 'datasets': datasets}]

        tasks = []
        for comb in model_dataset_combinations:
            comb['datasets'] = sorted(comb['datasets'],
                                      key=lambda x: self.get_cost(x),
                                      reverse=True)
            for model in comb['models']:
                chunks = []  # elements: tuple(size, dataset_chunk)
                for dataset in comb['datasets']:
                    filename = get_infer_output_path(model, dataset, out_dir)
                    # skip the task if the task output exists
                    if osp.exists(filename):
                        continue
                    dataset_size = self.get_cost(dataset)
                    if dataset_size > self.max_task_size:
                        root, ext = osp.splitext(filename)
                        dataset_splits = self.split_dataset(dataset)
                        for i, dataset_split in enumerate(dataset_splits):
                            if not osp.exists(f'{root}_{i}{ext}'):
                                chunks.append(
                                    (self.max_task_size, dataset_split))
                    else:
                        chunks.append((dataset_size, dataset))

                if self.strategy == 'heuristic':
                    chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
                    current_size, current_chunks = 0, []
                    for index in range(len(chunks)):
                        current_size += chunks[index][0]
                        current_chunks.append(chunks[index][1])
                        if index == len(chunks) - 1 or current_size + chunks[
                                index + 1][0] > self.max_task_size:
                            tasks.append(
                                Config({
                                    'models': [model],
                                    'datasets': [current_chunks],
                                    'work_dir': work_dir,
                                    **add_cfg
                                }))
                            current_size, current_chunks = 0, []
                elif self.strategy == 'split':
                    for _, dataset in chunks:
                        tasks.append(
                            Config({
                                'models': [model],
                                'datasets': [[dataset]],
                                'work_dir': work_dir,
                                **add_cfg
                            }))
        return tasks

    @property
    def dataset_size(self):
        if not hasattr(self, '_dataset_size'):
            if osp.exists(self.dataset_size_path):
                self._dataset_size = mmengine.load(self.dataset_size_path)
            else:
                self._dataset_size = {}
        return self._dataset_size

    def split_dataset(self, dataset_cfg: ConfigDict) -> List[ConfigDict]:
        """Split dataset into several parts."""
        dataset_size, num_repeats = self.get_cost(dataset_cfg,
                                                  get_raw_factors=True)
        split_configs = []
        abbr = dataset_abbr_from_cfg(dataset_cfg)
        step = self.max_task_size // num_repeats
        # evenly distribute the task
        step = math.ceil(dataset_size / math.ceil(dataset_size / step))
        for part, i in enumerate(range(0, dataset_size, step)):
            cfg = copy.deepcopy(dataset_cfg)
            cfg['abbr'] = abbr + f'_{part}'
            test_range = cfg['reader_cfg'].get('test_range', '')
            cfg['reader_cfg']['test_range'] = f'{test_range}[{i}:{i+step}]'
            split_configs.append(cfg)
        return split_configs

    def get_factor(self, dataset: ConfigDict) -> int:
        infer_cfg = dataset.infer_cfg
        template = (infer_cfg.prompt_template.template if 'prompt_template'
                    in infer_cfg else infer_cfg.ice_template.template)
        # If it's the Gen template, the dataset size will be multiplied by
        # self.gen_task_coef
        factor = self.gen_task_coef
        # If it's the PPL template, the dataset size will be multiplied by the
        # number of labels
        if isinstance(template, dict):
            ctr = sum(key in template for key in ('begin', 'round', 'end'))
            if ctr != len(template.keys()):
                factor = len(template.keys())

        dataset_abbr = dataset_abbr_from_cfg(dataset)
        if any(
                fnmatch(dataset_abbr, pattern)
                for pattern in ('bbh*', 'gsm8k*', 'math*', 'strategyqa*',
                                'agieval-jec*', 'agieval-gaokao-mathcloze',
                                'agieval-math', '*professional_law')):
            factor *= 10

        return factor

    def get_cost(self,
                 dataset: ConfigDict,
                 get_raw_factors: bool = False) -> Union[int, Tuple[int, int]]:
        """Get the computational cost of inferring on the dataset.

        Args:
            dataset (ConfigDict): The dataset config.
            get_raw_factors (bool): If True, the raw factors of computational
                cost will be returned.

        Returns:
            int or Tuple[int, int]: The size of the dataset. If
                get_raw_factors is True, the number of repeats will also be
                returned.
        """
        dataset_abbr = dataset_abbr_from_cfg(dataset)

        test_range = dataset.reader_cfg.get('test_range', '')
        factor = self.get_factor(dataset)

        if dataset_abbr in self.dataset_size:
            actual_size = eval('len(range(self.dataset_size[dataset_abbr])'
                               f'{test_range})')
            if get_raw_factors:
                return actual_size, factor
            return factor * actual_size

        dataset = build_dataset_from_cfg(dataset)
        self.dataset_size[dataset_abbr] = len(dataset.test)

        mmengine.mkdir_or_exist('.cache/')
        mmengine.dump(self.dataset_size,
                      self.dataset_size_path,
                      indent=4,
                      ensure_ascii=False)

        actual_size = eval('len(range(self.dataset_size[dataset_abbr])'
                           f'{test_range})')
        if get_raw_factors:
            return actual_size, factor
        return factor * actual_size
```
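The commit message lists `sub_size_partition` among its additions; the class above is that partitioner. A hedged sketch of how it might replace `SubjectiveNaivePartitioner` in the evaluation config at the start of this commit (the `max_task_size` value here is illustrative; it defaults to 40000):

```python
from mmengine.config import read_base

with read_base():
    from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b
    # judge config shipped in this commit
    from .models.judge_llm.pandalm.hf_pandalm_7b_v1 import models as judge_models

from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        mode='singlescore',
        models=[*hf_baichuan2_7b],
        max_task_size=10000,  # illustrative: split datasets costed above this
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,
        task=dict(type=SubjectiveEvalTask,
                  judge_cfg=judge_models[0])),
)
```

Large datasets are then split via `split_dataset`, which narrows each chunk's `reader_cfg['test_range']`, and the `heuristic` strategy additionally packs small datasets together up to `max_task_size`.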