From 97c2068bd9b21ac2b30177db6531554f4695bc51 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Tue, 19 Dec 2023 18:40:25 +0800 Subject: [PATCH] [Feature] Add JudgeLLMs (#710) * add judgellms * add judgellms * add sub_size_partition * add docs * add ref --- configs/eval_subjective_judge_pandalm.py | 84 ++++++ .../judge_llm/auto_j/hf_autoj_bilingual_6b.py | 26 ++ .../judge_llm/auto_j/hf_autoj_eng_13b.py | 20 ++ .../judge_llm/auto_j/hf_autoj_eng_13b_4bit.py | 25 ++ .../auto_j/hf_autoj_scen_classifier.py | 20 ++ .../judge_llm/judgelm/hf_judgelm_13b_v1.py | 20 ++ .../judge_llm/judgelm/hf_judgelm_33b_v1.py | 20 ++ .../judge_llm/judgelm/hf_judgelm_7b_v1.py | 20 ++ .../pandalm/hf_alpaca_pandalm_7b_v1.py | 20 ++ .../judge_llm/pandalm/hf_pandalm_7b_v1.py | 20 ++ .../advanced_guides/subjective_evaluation.md | 58 +++++ .../advanced_guides/subjective_evaluation.md | 60 +++++ opencompass/partitioners/sub_naive.py | 25 +- opencompass/partitioners/sub_size.py | 245 ++++++++++++++++++ 14 files changed, 650 insertions(+), 13 deletions(-) create mode 100644 configs/eval_subjective_judge_pandalm.py create mode 100644 configs/models/judge_llm/auto_j/hf_autoj_bilingual_6b.py create mode 100644 configs/models/judge_llm/auto_j/hf_autoj_eng_13b.py create mode 100644 configs/models/judge_llm/auto_j/hf_autoj_eng_13b_4bit.py create mode 100644 configs/models/judge_llm/auto_j/hf_autoj_scen_classifier.py create mode 100644 configs/models/judge_llm/judgelm/hf_judgelm_13b_v1.py create mode 100644 configs/models/judge_llm/judgelm/hf_judgelm_33b_v1.py create mode 100644 configs/models/judge_llm/judgelm/hf_judgelm_7b_v1.py create mode 100644 configs/models/judge_llm/pandalm/hf_alpaca_pandalm_7b_v1.py create mode 100644 configs/models/judge_llm/pandalm/hf_pandalm_7b_v1.py create mode 100644 opencompass/partitioners/sub_size.py diff --git a/configs/eval_subjective_judge_pandalm.py b/configs/eval_subjective_judge_pandalm.py new file mode 100644 index 00000000..41e2c526 --- /dev/null +++ b/configs/eval_subjective_judge_pandalm.py @@ -0,0 +1,84 @@ +from mmengine.config import read_base +with read_base(): + from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat + from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat + from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b + from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b + from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b + from .datasets.subjective_cmp.alignment_bench import subjective_datasets + +datasets = [*subjective_datasets] + +from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3 +from opencompass.partitioners import NaivePartitioner +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.runners import SlurmSequentialRunner +from opencompass.tasks import OpenICLInferTask +from opencompass.tasks.subjective_eval import SubjectiveEvalTask +from opencompass.summarizers import AlignmentBenchSummarizer + + +# -------------Inferen Stage ---------------------------------------- + +models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat] + +infer = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=SlurmSequentialRunner, + partition='llmeval', + quotatype='auto', + max_num_workers=256, + task=dict(type=OpenICLInferTask)), +) + 
+ 
+# -------------Evaluation Stage ----------------------------------------
+
+
+## ------------- JudgeLLM Configuration
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ]
+)
+
+judge_model = dict(
+        type=HuggingFaceCausalLM,
+        abbr='pandalm-7b-v1-hf',
+        path="WeOpenML/PandaLM-7B-v1",
+        tokenizer_path='WeOpenML/PandaLM-7B-v1',
+        tokenizer_kwargs=dict(padding_side='left',
+                              truncation_side='left',
+                              trust_remote_code=True,
+                              use_fast=False,),
+        max_out_len=512,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(device_map='auto', trust_remote_code=True),
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    )
+
+## ------------- Evaluation Configuration
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveNaivePartitioner,
+        mode='singlescore',
+        models = [*hf_baichuan2_7b]
+    ),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=2,
+        task=dict(
+            type=SubjectiveEvalTask,
+            judge_cfg=judge_model
+        )),
+)
+
+summarizer = dict(
+    type=AlignmentBenchSummarizer,
+)
+
+work_dir = 'outputs/pandalm'
diff --git a/configs/models/judge_llm/auto_j/hf_autoj_bilingual_6b.py b/configs/models/judge_llm/auto_j/hf_autoj_bilingual_6b.py
new file mode 100644
index 00000000..f115b751
--- /dev/null
+++ b/configs/models/judge_llm/auto_j/hf_autoj_bilingual_6b.py
@@ -0,0 +1,26 @@
+from opencompass.models import HuggingFaceCausalLM
+
+'''
+This is a bilingual 6B version of Auto-J.
+It is trained on both the original training data
+and its Chinese translation, which can be found at
+https://huggingface.co/GAIR/autoj-bilingual-6b
+'''
+
+models = [
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='autoj-bilingual-6b',
+        path="GAIR/autoj-bilingual-6b",
+        tokenizer_path='GAIR/autoj-bilingual-6b',
+        tokenizer_kwargs=dict(padding_side='left',
+                              truncation_side='left',
+                              trust_remote_code=True,
+                              use_fast=False,),
+        max_out_len=512,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(device_map='auto', trust_remote_code=True),
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    )
+]
diff --git a/configs/models/judge_llm/auto_j/hf_autoj_eng_13b.py b/configs/models/judge_llm/auto_j/hf_autoj_eng_13b.py
new file mode 100644
index 00000000..3252b3a5
--- /dev/null
+++ b/configs/models/judge_llm/auto_j/hf_autoj_eng_13b.py
@@ -0,0 +1,20 @@
+from opencompass.models import HuggingFaceCausalLM
+
+
+models = [
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='autoj-13b',
+        path="GAIR/autoj-13b",
+        tokenizer_path='GAIR/autoj-13b',
+        tokenizer_kwargs=dict(padding_side='left',
+                              truncation_side='left',
+                              trust_remote_code=True,
+                              use_fast=False,),
+        max_out_len=512,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(device_map='auto', trust_remote_code=True),
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    )
+]
diff --git a/configs/models/judge_llm/auto_j/hf_autoj_eng_13b_4bit.py b/configs/models/judge_llm/auto_j/hf_autoj_eng_13b_4bit.py
new file mode 100644
index 00000000..97309109
--- /dev/null
+++ b/configs/models/judge_llm/auto_j/hf_autoj_eng_13b_4bit.py
@@ -0,0 +1,25 @@
+from opencompass.models import HuggingFaceCausalLM
+
+'''
+This is a 4-bit quantized version of Auto-J produced with AutoGPTQ,
+which is available on the Hugging Face Hub:
+https://huggingface.co/GAIR/autoj-13b-GPTQ-4bits
+'''
+
+models = [
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='autoj-13b-GPTQ-4bits',
+        path="GAIR/autoj-13b-GPTQ-4bits",
+        tokenizer_path='GAIR/autoj-13b-GPTQ-4bits',
+        tokenizer_kwargs=dict(padding_side='left',
+                              truncation_side='left',
+                              trust_remote_code=True,
+                              use_fast=False,),
+        max_out_len=512,
+ max_seq_len=2048, + batch_size=8, + model_kwargs=dict(device_map='auto', trust_remote_code=True), + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/configs/models/judge_llm/auto_j/hf_autoj_scen_classifier.py b/configs/models/judge_llm/auto_j/hf_autoj_scen_classifier.py new file mode 100644 index 00000000..2d0b1175 --- /dev/null +++ b/configs/models/judge_llm/auto_j/hf_autoj_scen_classifier.py @@ -0,0 +1,20 @@ +from opencompass.models import HuggingFaceCausalLM + + +models = [ + dict( + type=HuggingFaceCausalLM, + abbr='autoj-scenario-classifier', + path="GAIR/autoj-scenario-classifier", + tokenizer_path='GAIR/autoj-scenario-classifier', + tokenizer_kwargs=dict(padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False,), + max_out_len=512, + max_seq_len=2048, + batch_size=8, + model_kwargs=dict(device_map='auto', trust_remote_code=True), + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/configs/models/judge_llm/judgelm/hf_judgelm_13b_v1.py b/configs/models/judge_llm/judgelm/hf_judgelm_13b_v1.py new file mode 100644 index 00000000..d3657db8 --- /dev/null +++ b/configs/models/judge_llm/judgelm/hf_judgelm_13b_v1.py @@ -0,0 +1,20 @@ +from opencompass.models import HuggingFaceCausalLM + + +models = [ + dict( + type=HuggingFaceCausalLM, + abbr='judgelm-13b-v1-hf', + path="BAAI/JudgeLM-13b-v1.0", + tokenizer_path='BAAI/JudgeLM-13b-v1.0', + tokenizer_kwargs=dict(padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False,), + max_out_len=512, + max_seq_len=2048, + batch_size=8, + model_kwargs=dict(device_map='auto', trust_remote_code=True), + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/configs/models/judge_llm/judgelm/hf_judgelm_33b_v1.py b/configs/models/judge_llm/judgelm/hf_judgelm_33b_v1.py new file mode 100644 index 00000000..47722071 --- /dev/null +++ b/configs/models/judge_llm/judgelm/hf_judgelm_33b_v1.py @@ -0,0 +1,20 @@ +from opencompass.models import HuggingFaceCausalLM + + +models = [ + dict( + type=HuggingFaceCausalLM, + abbr='judgelm-33b-v1-hf', + path="BAAI/JudgeLM-33b-v1.0", + tokenizer_path='BAAI/JudgeLM-33b-v1.0', + tokenizer_kwargs=dict(padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False,), + max_out_len=512, + max_seq_len=2048, + batch_size=8, + model_kwargs=dict(device_map='auto', trust_remote_code=True), + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/configs/models/judge_llm/judgelm/hf_judgelm_7b_v1.py b/configs/models/judge_llm/judgelm/hf_judgelm_7b_v1.py new file mode 100644 index 00000000..be59237b --- /dev/null +++ b/configs/models/judge_llm/judgelm/hf_judgelm_7b_v1.py @@ -0,0 +1,20 @@ +from opencompass.models import HuggingFaceCausalLM + + +models = [ + dict( + type=HuggingFaceCausalLM, + abbr='judgelm-7b-v1-hf', + path="BAAI/JudgeLM-7B-v1.0", + tokenizer_path='BAAI/JudgeLM-7B-v1.0', + tokenizer_kwargs=dict(padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False,), + max_out_len=512, + max_seq_len=2048, + batch_size=8, + model_kwargs=dict(device_map='auto', trust_remote_code=True), + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/configs/models/judge_llm/pandalm/hf_alpaca_pandalm_7b_v1.py b/configs/models/judge_llm/pandalm/hf_alpaca_pandalm_7b_v1.py new file mode 100644 index 00000000..8e7ba0fd --- /dev/null +++ b/configs/models/judge_llm/pandalm/hf_alpaca_pandalm_7b_v1.py @@ -0,0 +1,20 @@ +from opencompass.models import HuggingFaceCausalLM + + +models = [ + dict( + 
type=HuggingFaceCausalLM,
+        abbr='alpaca-pandalm-7b-v1-hf',
+        path="WeOpenML/PandaLM-Alpaca-7B-v1",
+        tokenizer_path='WeOpenML/PandaLM-Alpaca-7B-v1',
+        tokenizer_kwargs=dict(padding_side='left',
+                              truncation_side='left',
+                              trust_remote_code=True,
+                              use_fast=False,),
+        max_out_len=512,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(device_map='auto', trust_remote_code=True),
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    )
+]
diff --git a/configs/models/judge_llm/pandalm/hf_pandalm_7b_v1.py b/configs/models/judge_llm/pandalm/hf_pandalm_7b_v1.py
new file mode 100644
index 00000000..00926914
--- /dev/null
+++ b/configs/models/judge_llm/pandalm/hf_pandalm_7b_v1.py
@@ -0,0 +1,20 @@
+from opencompass.models import HuggingFaceCausalLM
+
+
+models = [
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='pandalm-7b-v1-hf',
+        path="WeOpenML/PandaLM-7B-v1",
+        tokenizer_path='WeOpenML/PandaLM-7B-v1',
+        tokenizer_kwargs=dict(padding_side='left',
+                              truncation_side='left',
+                              trust_remote_code=True,
+                              use_fast=False,),
+        max_out_len=512,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(device_map='auto', trust_remote_code=True),
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    )
+]
diff --git a/docs/en/advanced_guides/subjective_evaluation.md b/docs/en/advanced_guides/subjective_evaluation.md
index cf3e2c4f..9746a0fd 100644
--- a/docs/en/advanced_guides/subjective_evaluation.md
+++ b/docs/en/advanced_guides/subjective_evaluation.md
@@ -144,6 +144,64 @@ The `-r` parameter allows the reuse of model inference and GPT-4 evaluation resu
 The response of JudgeLLM will be output to `output/.../results/timestamp/xxmodel/xxdataset/.json`.
 The evaluation report will be output to `output/.../summary/timestamp/report.csv`.
 
+OpenCompass supports many JudgeLLMs; in fact, any model available in the OpenCompass configs can be used as a JudgeLLM.
+The popular open-source JudgeLLMs are listed here:
+
+1. Auto-J, refer to `configs/models/judge_llm/auto_j`
+
+Consider citing the following paper if you find it helpful:
+
+```bibtex
+@article{li2023generative,
+  title={Generative judge for evaluating alignment},
+  author={Li, Junlong and Sun, Shichao and Yuan, Weizhe and Fan, Run-Ze and Zhao, Hai and Liu, Pengfei},
+  journal={arXiv preprint arXiv:2310.05470},
+  year={2023}
+}
+@misc{2023opencompass,
+    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
+    author={OpenCompass Contributors},
+    howpublished = {\url{https://github.com/open-compass/opencompass}},
+    year={2023}
+}
+```
+
+2. JudgeLM, refer to `configs/models/judge_llm/judgelm`
+
+```bibtex
+@article{zhu2023judgelm,
+  title={JudgeLM: Fine-tuned Large Language Models are Scalable Judges},
+  author={Zhu, Lianghui and Wang, Xinggang and Wang, Xinlong},
+  journal={arXiv preprint arXiv:2310.17631},
+  year={2023}
+}
+@misc{2023opencompass,
+    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
+    author={OpenCompass Contributors},
+    howpublished = {\url{https://github.com/open-compass/opencompass}},
+    year={2023}
+}
+```
+
+3. 
PandaLM, refer to `configs/models/judge_llm/pandalm`
+
+Consider citing the following paper if you find it helpful:
+
+```bibtex
+@article{wang2023pandalm,
+  title={PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization},
+  author={Wang, Yidong and Yu, Zhuohao and Zeng, Zhengran and Yang, Linyi and Wang, Cunxiang and Chen, Hao and Jiang, Chaoya and Xie, Rui and Wang, Jindong and Xie, Xing and others},
+  journal={arXiv preprint arXiv:2306.05087},
+  year={2023}
+}
+@misc{2023opencompass,
+    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
+    author={OpenCompass Contributors},
+    howpublished = {\url{https://github.com/open-compass/opencompass}},
+    year={2023}
+}
+```
+
 ## Practice: AlignBench Evaluation
 
 ### Dataset
diff --git a/docs/zh_cn/advanced_guides/subjective_evaluation.md b/docs/zh_cn/advanced_guides/subjective_evaluation.md
index 0035ece7..555c3f8e 100644
--- a/docs/zh_cn/advanced_guides/subjective_evaluation.md
+++ b/docs/zh_cn/advanced_guides/subjective_evaluation.md
@@ -142,6 +142,66 @@ python run.py configs/eval_subjective_score.py -r
 JudgeLLM的评测回复会保存在 `output/.../results/timestamp/xxmodel/xxdataset/.json`
 评测报告则会输出到 `output/.../summary/timestamp/report.csv`。
 
+Opencompass 已经支持了很多的JudgeLLM,实际上,你可以将Opencompass中所支持的所有模型都当作JudgeLLM使用。
+我们列出目前比较流行的开源JudgeLLM:
+
+1. Auto-J,请参考 `configs/models/judge_llm/auto_j`
+
+如果使用了该方法,请添加引用:
+
+```bibtex
+@article{li2023generative,
+  title={Generative judge for evaluating alignment},
+  author={Li, Junlong and Sun, Shichao and Yuan, Weizhe and Fan, Run-Ze and Zhao, Hai and Liu, Pengfei},
+  journal={arXiv preprint arXiv:2310.05470},
+  year={2023}
+}
+@misc{2023opencompass,
+    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
+    author={OpenCompass Contributors},
+    howpublished = {\url{https://github.com/open-compass/opencompass}},
+    year={2023}
+}
+```
+
+2. JudgeLM,请参考 `configs/models/judge_llm/judgelm`
+
+如果使用了该方法,请添加引用:
+
+```bibtex
+@article{zhu2023judgelm,
+  title={JudgeLM: Fine-tuned Large Language Models are Scalable Judges},
+  author={Zhu, Lianghui and Wang, Xinggang and Wang, Xinlong},
+  journal={arXiv preprint arXiv:2310.17631},
+  year={2023}
+}
+@misc{2023opencompass,
+    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
+    author={OpenCompass Contributors},
+    howpublished = {\url{https://github.com/open-compass/opencompass}},
+    year={2023}
+}
+```
+
+3. 
PandaLM,请参考 `configs/models/judge_llm/pandalm` + +如果使用了该方法,请添加引用: + +```bibtex +@article{wang2023pandalm, + title={PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization}, + author={Wang, Yidong and Yu, Zhuohao and Zeng, Zhengran and Yang, Linyi and Wang, Cunxiang and Chen, Hao and Jiang, Chaoya and Xie, Rui and Wang, Jindong and Xie, Xing and others}, + journal={arXiv preprint arXiv:2306.05087}, + year={2023} +} +@misc{2023opencompass, + title={OpenCompass: A Universal Evaluation Platform for Foundation Models}, + author={OpenCompass Contributors}, + howpublished = {\url{https://github.com/open-compass/opencompass}}, + year={2023} +} +``` + ## 实战:AlignBench 主观评测 ### 数据集准备 diff --git a/opencompass/partitioners/sub_naive.py b/opencompass/partitioners/sub_naive.py index e21193b0..5ae1e801 100644 --- a/opencompass/partitioners/sub_naive.py +++ b/opencompass/partitioners/sub_naive.py @@ -8,18 +8,6 @@ from opencompass.registry import PARTITIONERS from .naive import NaivePartitioner -def remove_duplicate_pairs(model_combinations): - combo_dict = {} - for i, combo in enumerate(model_combinations): - sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr']))) - if sorted_names not in combo_dict: - combo_dict[sorted_names] = i - new_model_combinations = [ - model_combinations[i] for i in combo_dict.values() - ] - return new_model_combinations - - @PARTITIONERS.register_module() class SubjectiveNaivePartitioner(NaivePartitioner): """Naive task partitioner for subjective evaluation. Compared to @@ -47,6 +35,17 @@ class SubjectiveNaivePartitioner(NaivePartitioner): self.compare_models = compare_models self.model_pairs = model_pairs + def remove_duplicate_pairs(self, model_combinations): + combo_dict = {} + for i, combo in enumerate(model_combinations): + sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr']))) + if sorted_names not in combo_dict: + combo_dict[sorted_names] = i + new_model_combinations = [ + model_combinations[i] for i in combo_dict.values() + ] + return new_model_combinations + def get_model_combinations( self, models: List[ConfigDict], @@ -58,7 +57,7 @@ class SubjectiveNaivePartitioner(NaivePartitioner): elif self.mode == 'm2n': assert len(base_models) > 0 and len(compare_models) > 0 model_combinations = list(product(base_models, compare_models)) - unique_combinations = remove_duplicate_pairs([ + unique_combinations = self.remove_duplicate_pairs([ combo for combo in model_combinations if combo[0] != combo[1] ]) return unique_combinations diff --git a/opencompass/partitioners/sub_size.py b/opencompass/partitioners/sub_size.py new file mode 100644 index 00000000..624cef7b --- /dev/null +++ b/opencompass/partitioners/sub_size.py @@ -0,0 +1,245 @@ +import copy +import math +import os.path as osp +from fnmatch import fnmatch +from typing import Dict, List, Optional, Tuple, Union + +import mmengine +from mmengine.config import Config, ConfigDict + +from opencompass.registry import PARTITIONERS +from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg, + get_infer_output_path) + +from .sub_naive import SubjectiveNaivePartitioner + + +@PARTITIONERS.register_module() +class SubjectiveSizePartitioner(SubjectiveNaivePartitioner): + """Task partitioner based on the size of the dataset (with some rough + expansion as an estimation of computational cost). + + Args: + out_dir (str): The output directory of tasks. + max_task_size (int): The maximum size of a task. 
+ gen_task_coef (int): The dataset cost measurement coefficient for + generation tasks. + strategy (str): The partition strategy. Supported strategies are: + 'heuristic' and 'split'. Defaults to 'heuristic'. + heuristic: split large datasets into several tasks, merge small + datasets into one task. + split: split large datasets into several tasks only. + dataset_size_path (str): The path to the dataset size cache file. + keep_keys (list[str]): The keys to be kept from the experiment config + to the task config. + """ + + def __init__(self, + mode: str, + out_dir: str, + models: Optional[List[ConfigDict]] = [], + base_models: Optional[List[ConfigDict]] = [], + compare_models: Optional[List[ConfigDict]] = [], + model_pairs: Optional[List[Tuple]] = None, + max_task_size: int = 40000, + gen_task_coef: int = 20, + strategy: str = 'heuristic', + dataset_size_path: str = '.cache/dataset_size.json', + keep_keys: Optional[List[str]] = None): + super().__init__(out_dir=out_dir, + keep_keys=keep_keys, + mode=mode, + models=models, + base_models=base_models, + compare_models=compare_models, + model_pairs=model_pairs) + self.max_task_size = max_task_size + self.gen_task_coef = gen_task_coef + self.dataset_size_path = dataset_size_path + assert strategy in ('heuristic', 'split'), \ + f'Unsupported partition strategy: {strategy}. '\ + 'Supported strategies are: `heuristic`, `split` .' + self.strategy = strategy + + def partition(self, + models: List[ConfigDict], + datasets: List[ConfigDict], + work_dir: str, + out_dir: str, + add_cfg: Dict = {}) -> List[ConfigDict]: + """Partition model-dataset pairs into tasks. Each task is defined as a + dict and will run independently as a unit. Its structure is as + follows: + + .. code-block:: python + + { + 'models': [], # a list of model configs + 'datasets': [[]], # a nested list of dataset configs, each + list corresponds to a model + 'work_dir': '', # the work dir + **add_cfg # other keys to be kept in the config + } + + Args: + models (List[ConfigDict]): A list of model configs. + datasets (List[ConfigDict]): A list of dataset configs. + work_dir (str): The work dir for the task. + out_dir (str): The full output path for the task, intended for + Partitioners to check whether the task is finished via the + existency of result file in this directory. + add_cfg (dict): Other common keys to be added in the task config, + used to share the same config among tasks. Defaults to {}. + + Returns: + List[ConfigDict]: A list of tasks. 
+ """ + models = self.models if self.models != [] else models + base_models, compare_models = self.base_models, self.compare_models + if self.mode == 'singlescore': + models = models + else: + models = super().get_model_combinations(models, base_models, + compare_models) + model_dataset_combinations = [{'models': models, 'datasets': datasets}] + + tasks = [] + for comb in model_dataset_combinations: + comb['datasets'] = sorted(comb['datasets'], + key=lambda x: self.get_cost(x), + reverse=True) + for model in comb['models']: + chunks = [] # elements: tuple(size, dataset_chunk) + for dataset in comb['datasets']: + filename = get_infer_output_path(model, dataset, out_dir) + # skip the task if the task output exists + if osp.exists(filename): + continue + dataset_size = self.get_cost(dataset) + if dataset_size > self.max_task_size: + root, ext = osp.splitext(filename) + dataset_splits = self.split_dataset(dataset) + for i, dataset_split in enumerate(dataset_splits): + if not osp.exists(f'{root}_{i}{ext}'): + chunks.append( + (self.max_task_size, dataset_split)) + else: + chunks.append((dataset_size, dataset)) + + if self.strategy == 'heuristic': + chunks = sorted(chunks, key=lambda x: x[0], reverse=True) + current_size, current_chunks = 0, [] + for index in range(len(chunks)): + current_size += chunks[index][0] + current_chunks.append(chunks[index][1]) + if index == len(chunks) - 1 or current_size + chunks[ + index + 1][0] > self.max_task_size: + tasks.append( + Config({ + 'models': [model], + 'datasets': [current_chunks], + 'work_dir': work_dir, + **add_cfg + })) + current_size, current_chunks = 0, [] + elif self.strategy == 'split': + for _, dataset in chunks: + tasks.append( + Config({ + 'models': [model], + 'datasets': [[dataset]], + 'work_dir': work_dir, + **add_cfg + })) + return tasks + + @property + def dataset_size(self): + if not hasattr(self, '_dataset_size'): + if osp.exists(self.dataset_size_path): + self._dataset_size = mmengine.load(self.dataset_size_path) + else: + self._dataset_size = {} + return self._dataset_size + + def split_dataset(self, dataset_cfg: ConfigDict) -> List[ConfigDict]: + """Split dataset into several parts.""" + dataset_size, num_repeats = self.get_cost(dataset_cfg, + get_raw_factors=True) + split_configs = [] + abbr = dataset_abbr_from_cfg(dataset_cfg) + step = self.max_task_size // num_repeats + # evenly distribute the task + step = math.ceil(dataset_size / math.ceil(dataset_size / step)) + for part, i in enumerate(range(0, dataset_size, step)): + cfg = copy.deepcopy(dataset_cfg) + cfg['abbr'] = abbr + f'_{part}' + test_range = cfg['reader_cfg'].get('test_range', '') + cfg['reader_cfg']['test_range'] = f'{test_range}[{i}:{i+step}]' + split_configs.append(cfg) + return split_configs + + def get_factor(self, dataset: ConfigDict) -> int: + infer_cfg = dataset.infer_cfg + template = (infer_cfg.prompt_template.template if 'prompt_template' + in infer_cfg else infer_cfg.ice_template.template) + # If it's the Gen template, the dataset size will be multiplied by the + # self.gen_task_coef + factor = self.gen_task_coef + # If it's the PPL template, the dataset size will be multiplied by the + # number of labels + if isinstance(template, dict): + ctr = sum(key in template for key in ('begin', 'round', 'end')) + if ctr != len(template.keys()): + factor = len(template.keys()) + + dataset_abbr = dataset_abbr_from_cfg(dataset) + if any( + fnmatch(dataset_abbr, pattern) + for pattern in ('bbh*', 'gsm8k*', 'math*', 'strategyqa*', + 'agieval-jec*', 
'agieval-gaokao-mathcloze', + 'agieval-math', '*professional_law')): + factor *= 10 + + return factor + + def get_cost(self, + dataset: ConfigDict, + get_raw_factors: bool = False) -> Union[int, Tuple[int, int]]: + """Get the computational cost of inferring on the dataset. + + Args: + dataset (ConfigDict): The dataset config. + get_raw_factors (bool): If True, the raw factors of computational + cost will be returned. + + Returns: + int or Tuple[int, int]: The size of the dataset. If get_raw_factors + is True, the number of repeats will also be returned. + """ + dataset_abbr = dataset_abbr_from_cfg(dataset) + + test_range = dataset.reader_cfg.get('test_range', '') + factor = self.get_factor(dataset) + + if dataset_abbr in self.dataset_size: + actual_size = eval('len(range(self.dataset_size[dataset_abbr])' + f'{test_range})') + if get_raw_factors: + return actual_size, factor + return factor * actual_size + + dataset = build_dataset_from_cfg(dataset) + self.dataset_size[dataset_abbr] = len(dataset.test) + + mmengine.mkdir_or_exist('.cache/') + mmengine.dump(self.dataset_size, + self.dataset_size_path, + indent=4, + ensure_ascii=False) + + actual_size = eval('len(range(self.dataset_size[dataset_abbr])' + f'{test_range})') + if get_raw_factors: + return actual_size, factor + return factor * actual_size
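
For reference, below is a minimal, illustrative sketch (not part of this patch) of how the new `SubjectiveSizePartitioner` could stand in for `SubjectiveNaivePartitioner` in the evaluation stage of a config such as `configs/eval_subjective_judge_pandalm.py`, so that large subjective datasets are split into size-bounded tasks. The `max_task_size` value is an assumed placeholder, and the snippet assumes the surrounding config already defines `judge_model`, `hf_baichuan2_7b`, `LocalRunner`, and `SubjectiveEvalTask` as above, with `out_dir` injected by the framework as it is for the other partitioners.

```python
# Illustrative sketch, not part of this patch: size-based partitioning for the
# subjective evaluation stage. Values here are assumptions for demonstration.
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner

eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        mode='singlescore',          # same single-model scoring mode as above
        models=[*hf_baichuan2_7b],   # models whose predictions will be judged
        max_task_size=10000,         # assumed cap; oversized datasets get split
        strategy='heuristic',        # merge small datasets, split large ones
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,
        task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model),
    ),
)
```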