mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

[Feature] Add multi-model judge and fix some problems (#1016)

* support multi-model judge and moe judge
* test_moe
* test_moe
* test
* add moe judge
* support multi-judge-model

This commit is contained in:
parent c220550fb9
commit 2d4e559763
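In short, this commit turns the single `judge_model` entry in subjective-eval configs into a `judge_models` list, moves `infer_order` and the judge models into the subjective partitioners (which can also take an optional `meta_judge_model` for a second, meta-review stage), and drops `judge_cfg` from the runner's task config. Below is a minimal sketch of the new-style config, pieced together from the hunks that follow; the import paths and the judge's settings are assumptions for illustration, not copied verbatim from any one file in this commit.

    # Sketch only: shows the pattern introduced by this PR, under assumed imports.
    from opencompass.models import OpenAI
    from opencompass.partitioners import SubjectiveSizePartitioner  # assumed export path
    from opencompass.runners import LocalRunner
    from opencompass.tasks.subjective_eval import SubjectiveEvalTask

    models = [...]  # candidate models under evaluation, defined as usual

    # One or more judge models are now passed as a list instead of a single dict.
    judge_models = [
        dict(abbr='GPT4-Turbo', type=OpenAI, path='gpt-4-1106-preview',
             max_seq_len=2048, batch_size=8, temperature=0),
    ]

    eval = dict(
        partitioner=dict(
            type=SubjectiveSizePartitioner,
            max_task_size=1000,
            mode='singlescore',
            models=models,
            judge_models=judge_models,   # judge models now live in the partitioner
            # meta_judge_model=...,      # optional second-stage meta reviewer
            # infer_order='random',      # moved here from LMEvaluator
        ),
        # judge_cfg is no longer passed to SubjectiveEvalTask; the partitioner
        # attaches the judge (and meta-judge) models to each task instead.
        runner=dict(type=LocalRunner, max_num_workers=2,
                    task=dict(type=SubjectiveEvalTask)),
    )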
@@ -65,7 +65,6 @@ for _name in subjective_all_sets:
     subjective_eval_cfg = dict(
         evaluator=dict(
             type=LMEvaluator,
-            infer_order='random',
             prompt_template=dict(
                 type=PromptTemplate,
                 template=dict(
@@ -67,7 +67,6 @@ for _name in subjective_all_sets:
     subjective_eval_cfg = dict(
         evaluator=dict(
             type=LMEvaluator,
-            infer_order='random',
             prompt_template=dict(
                 type=PromptTemplate,
                 template=dict(
@@ -119,7 +119,6 @@ for _name, _prompt in sub_map.items():
     subjective_eval_cfg = dict(
         evaluator=dict(
             type=LMEvaluator,
-            infer_order='double',
             prompt_template=dict(
                 type=PromptTemplate,
                 template=dict(round=[
@@ -0,0 +1,156 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import CompassArenaDataset
+
+subjective_reader_cfg = dict(
+    input_columns=['question', 'ref'],
+    output_column='judge',
+    )
+
+data_path ="data/subjective/compass_arena"
+
+subjective_datasets = []
+
+base_prompt = """
+
+[回答1开始]
+{prediction}
+[回答1结束]
+
+[回答2开始]
+{prediction2}
+[回答2结束]
+
+根据评分要求,在以下 3 个选项中做出选择:
+A. 回答1更好
+B. 回答2更好
+C. 回答1、2平局
+并提供你的解释原因。
+
+如果你认为回答1更好,你的输出应形如:
+选择:A
+原因:blahblah blahblah\n
+
+如果你认为回答2更好,你的输出应形如:
+选择:B
+原因:blahblah blahblah\n
+
+如果你认为回答1、2打成平手,你的输出应形如:
+选择:C
+原因:blahblah blahblah\n
+"""
+
+knowledge_prompt = """
+请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
+评分要求(重要性依次递减):
+1. 更好的回答能与参考答案吻合或表明参考答案的意思。
+2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误。
+3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。
+
+[用户问题]
+{question}
+
+[参考答案]
+{ref}
+""" + base_prompt
+
+
+language_prompt = """
+请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
+评分要求(重要性依次递减):
+1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好。
+2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等
+3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误。
+
+[用户问题]
+{question}
+
+[参考答案]
+{ref}
+""" + base_prompt
+
+
+math_prompt = """
+请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
+评分要求(重要性依次递减):
+1. 更好的回答的答案能和参考答案一致。
+2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理。
+3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。
+
+[用户问题]
+{question}
+
+[参考答案]
+{ref}
+""" + base_prompt
+
+reason_prompt = math_prompt
+
+creation_prompt = """
+请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
+评分要求(重要性依次递减):
+1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
+2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答
+3. 好的回答必须具有创造性的词语和表达丰富度
+
+[用户问题]
+{question}
+""" + base_prompt
+
+sub_map = {"knowledge": knowledge_prompt, "language": language_prompt, "math_v2": math_prompt, "reason_v2": reason_prompt, "creationv2_zh": creation_prompt}
+
+meta_prompt = """
+\n你是一个评判专家,请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。\n评分要求(重要性依次递减):\n1. 好的回答必须首先符合用户问题里的各种需求,不能跑题 \n2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答\n3. 好的回答必须具有创造性的词语和表达丰富度\n\n[用户问题]\n{question}\n[回答1开始]\n{prediction}\n[回答1结束]\n[回答2开始]\n{prediction2}\n[回答2结束]\n此外,还有两个其他评判专家的评判意见供你参考。\n[评判意见1]\n{judgement}\n[评判意见2]\n{judgement2}\n\n最终请你综合其他评判专家的评判意见与你自己的意见,在以下 3 个选项中做出选择:\nA. 回答1更好\nB. 回答2更好\nC. 回答1、2平局\n并提供你的解释原因。\n\n如果你认为回答1更好,你的输出应形如:\n选择:A\n原因:blahblah blahblah\n\n\n如果你认为回答2更好,你的输出应形如:\n选择:B\n原因:blahblah blahblah\n\n\n如果你认为回答1、2打成平手,你的输出应形如:\n选择:C\n原因:blahblah blahblah\n\n
+"""
+for _name, _prompt in sub_map.items():
+    subjective_infer_cfg = dict(
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt="{question}"
+                    ),
+                ]),
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
+        )
+
+    subjective_eval_cfg = dict(
+        evaluator=dict(
+            type=LMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt = _prompt
+                    ),
+                ]),
+            ),
+            meta_review_prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt = meta_prompt
+                    ),
+                ]),
+            ),
+        ),
+        pred_role="BOT",
+    )
+
+    subjective_datasets.append(
+        dict(
+            abbr=f"{_name}",
+            type=CompassArenaDataset,
+            path=data_path,
+            name=_name,
+            reader_cfg=subjective_reader_cfg,
+            infer_cfg=subjective_infer_cfg,
+            eval_cfg=subjective_eval_cfg
+        ))
@@ -1,71 +0,0 @@
-from opencompass.openicl.icl_prompt_template import PromptTemplate
-from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import LMEvaluator
-from opencompass.datasets import IRDataset
-
-subjective_reader_cfg = dict(
-    input_columns=['question', 'capability', 'ref'],
-    output_column='judge',
-    )
-
-subjective_all_sets = [
-    "information_retrieval",
-]
-data_path ="data/subjective/"
-
-subjective_datasets = []
-
-for _name in subjective_all_sets:
-    subjective_infer_cfg = dict(
-            prompt_template=dict(
-                type=PromptTemplate,
-                template=dict(round=[
-                    dict(
-                        role='HUMAN',
-                        prompt="{question}"
-                    ),
-                ]),
-            ),
-            retriever=dict(type=ZeroRetriever),
-            inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512),
-        )
-
-    subjective_eval_cfg = dict(
-        evaluator=dict(
-            type=LMEvaluator,
-            prompt_template=dict(
-                type=PromptTemplate,
-                template=dict(round=[
-                    dict(
-                        role='HUMAN',
-                        prompt = """为上传的针对给定用户问题的回应撰写评论, 并为该回复打分:
-
-[BEGIN DATA]
-***
-[用户问询]: {question}
-***
-[回应]: {prediction}
-***
-[参考答案]: {ref}
-***
-[END DATA]
-
-请根据参考答案为这个回应撰写评论. 在这之后, 你应该按照如下格式给这个回应一个最终的1-10范围的评分: "[[评分]]", 例如: "评分: [[5]]"."""
-                    ),
-                ]),
-            ),
-        ),
-        pred_role="BOT",
-    )
-
-    subjective_datasets.append(
-        dict(
-            abbr=f"{_name}",
-            type=IRDataset,
-            path=data_path,
-            name=_name,
-            reader_cfg=subjective_reader_cfg,
-            infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
-        ))
@@ -1,59 +0,0 @@
-from opencompass.openicl.icl_prompt_template import PromptTemplate
-from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import LMEvaluator
-from opencompass.datasets import IRDataset
-
-subjective_reader_cfg = dict(
-    input_columns=['question', 'capability', 'gpt4_prefix', 'gpt4_suffix', 'ref'],
-    output_column='judge',
-    )
-
-subjective_all_sets = [
-    "information_retrieval",
-]
-data_path ="data/subjective/"
-
-subjective_datasets = []
-
-for _name in subjective_all_sets:
-    subjective_infer_cfg = dict(
-            prompt_template=dict(
-                type=PromptTemplate,
-                template=dict(round=[
-                    dict(
-                        role='HUMAN',
-                        prompt="{question}"
-                    ),
-                ]),
-            ),
-            retriever=dict(type=ZeroRetriever),
-            inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512),
-        )
-
-    subjective_eval_cfg = dict(
-        evaluator=dict(
-            type=LMEvaluator,
-            prompt_template=dict(
-                type=PromptTemplate,
-                template=dict(round=[
-                    dict(
-                        role='HUMAN',
-                        prompt = "{gpt4_prefix}{prediction}{gpt4_suffix}"
-                    ),
-                ]),
-            ),
-        ),
-        pred_role="BOT",
-    )
-
-    subjective_datasets.append(
-        dict(
-            abbr=f"{_name}",
-            type=IRDataset,
-            path=data_path,
-            name=_name,
-            reader_cfg=subjective_reader_cfg,
-            infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
-        ))
@@ -44,7 +44,7 @@ models = [
         meta_template=api_meta_template,
         max_out_len=2048,
         max_seq_len=4096,
-        batch_size=1,
+        batch_size=8,
         run_cfg=dict(num_gpus=1, num_procs=1),
     )
 ]
@@ -54,7 +54,7 @@ datasets = [*subjective_datasets]
 # -------------Evalation Stage ----------------------------------------
 
 ## ------------- JudgeLLM Configuration
-judge_model = dict(
+judge_models = [dict(
     abbr='GPT4-Turbo',
     type=OpenAI,
     path='gpt-4-1106-preview',
@@ -65,18 +65,14 @@ judge_model = dict(
     max_seq_len=2048,
     batch_size=8,
     temperature=0,
-)
+)]
 
 ## ------------- Evaluation Configuration
 eval = dict(
     partitioner=dict(
-        type=SubjectiveNaivePartitioner, mode='singlescore', models=models
+        type=SubjectiveSizePartitioner, max_task_size=1000, mode='singlescore', models=models, judge_models=judge_models,
     ),
-    runner=dict(
-        type=LocalRunner,
-        max_num_workers=2,
-        task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model),
-    ),
+    runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
 )
 
 summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
@@ -47,7 +47,7 @@ models = [
         meta_template=api_meta_template,
         max_out_len=2048,
         max_seq_len=4096,
-        batch_size=1,
+        batch_size=8,
         run_cfg=dict(num_gpus=1, num_procs=1),
     )
 ]
@@ -73,7 +73,7 @@ gpt4 = dict(
 # -------------Evalation Stage ----------------------------------------
 
 ## ------------- JudgeLLM Configuration
-judge_model = dict(
+judge_models = [dict(
     abbr='GPT4-Turbo',
     type=OpenAI,
     path='gpt-4-1106-preview',
@@ -85,21 +85,20 @@ judge_model = dict(
     batch_size=2,
     retry=20,
     temperature=0,
-)
+)]
 
 ## ------------- Evaluation Configuration
 eval = dict(
     partitioner=dict(
-        type=SubjectiveSizePartitioner, max_task_size=1000, mode='m2n', base_models=[gpt4], compare_models=models
+        type=SubjectiveSizePartitioner, max_task_size=1000, mode='m2n', base_models=[gpt4], compare_models=models,
+        infer_order='random',
+        judge_models=judge_models
     ),
-    runner=dict(
-        type=SlurmSequentialRunner,
-        partition='llmeval',
-        quotatype='auto',
-        max_num_workers=256,
-        task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model),
-    ),
+    runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
+    given_pred = [{'abbr':'gpt4-turbo', 'path':''}]
 )
 work_dir = 'outputs/alpaca/'
 
 
 summarizer = dict(type=AlpacaSummarizer, judge_type='v2')
@@ -72,7 +72,7 @@ gpt4 = dict(
 # -------------Evalation Stage ----------------------------------------
 
 ## ------------- JudgeLLM Configuration
-judge_model = dict(
+judge_models = [dict(
     abbr='GPT4-Turbo',
     type=OpenAI,
     path='gpt-4-1106-preview',
@@ -84,7 +84,7 @@ judge_model = dict(
     batch_size=2,
     retry=20,
     temperature=0,
-)
+)]
 
 ## ------------- Evaluation Configuration
 eval = dict(
@@ -93,16 +93,13 @@ eval = dict(
         strategy='split',
         max_task_size=10000,
         mode='m2n',
+        infer_order='double',
         base_models=[gpt4],
         compare_models=models,
+        judge_models=judge_models,
     ),
-    runner=dict(
-        type=SlurmSequentialRunner,
-        partition='llm_dev2',
-        quotatype='auto',
-        max_num_workers=32,
-        task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model),
-    ),
+    runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
+    given_pred = [{'abbr':'gpt4-turbo', 'path':''}]
 )
 
 work_dir = 'outputs/compass_arena_debug/'
@@ -63,7 +63,7 @@ infer = dict(
 # -------------Evalation Stage ----------------------------------------
 
 ## ------------- JudgeLLM Configuration
-judge_model = dict(
+judge_models = [dict(
     type=HuggingFaceCausalLM,
     abbr='pandalm-7b-v1-hf',
     path='WeOpenML/PandaLM-7B-v1',
@@ -79,12 +79,12 @@ judge_model = dict(
     batch_size=8,
     model_kwargs=dict(device_map='auto', trust_remote_code=True),
     run_cfg=dict(num_gpus=1, num_procs=1),
-)
+)]
 
 ## ------------- Evaluation Configuration
 eval = dict(
-    partitioner=dict(type=SubjectiveNaivePartitioner, mode='singlescore', models=models),
-    runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)),
+    partitioner=dict(type=SubjectiveNaivePartitioner, mode='singlescore', models=models, judge_models=judge_models),
+    runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
 )
 
 summarizer = dict(type=AlignmentBenchSummarizer)
@@ -2,7 +2,6 @@ from mmengine.config import read_base
 
 with read_base():
     from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
-    # from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets
 
 from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
 from opencompass.models.openai_api import OpenAIAllesAPIN
@@ -62,7 +61,7 @@ datasets = [*subjective_datasets]
 # -------------Evalation Stage ----------------------------------------
 
 ## ------------- JudgeLLM Configuration
-judge_model = dict(
+judge_models = [dict(
     abbr='GPT4-Turbo',
     type=OpenAI,
     path='gpt-4-0613', # To compare with the official leaderboard, please use gpt4-0613
@@ -73,23 +72,12 @@ judge_model = dict(
     max_seq_len=2048,
     batch_size=8,
     temperature=0,
-)
-## ------------- Evaluation Configuration
-# ## pair evaluation
-# eval = dict(
-#     partitioner=dict(
-#         type=SubjectiveSizePartitioner, max_task_size=100, mode='m2n', base_models=[gpt4], compare_models=models
-#     ),
-#     runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)),
-# )
-
-# summarizer = dict(type=MTBenchSummarizer, judge_type='pair')
-
+)]
 
 ## single evaluation
 eval = dict(
-    partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models),
-    runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)),
+    partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models),
+    runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
 )
 
 summarizer = dict(type=MTBenchSummarizer, judge_type='single')
@@ -27,10 +27,12 @@ def extract_dicts(data):
     return predictions
 
 
-def order_preds_and_record_references(predictions,
-                                      references,
-                                      infer_order,
-                                      seed=2680):
+def order_preds_and_record_references(
+    predictions,
+    references,
+    infer_order,
+    seed=666,
+):
     """Order predictions based on args and recording regrading references.
 
     Args:
|
|||||||
prompt_template: ConfigDict,
|
prompt_template: ConfigDict,
|
||||||
judge_cfg: ConfigDict,
|
judge_cfg: ConfigDict,
|
||||||
output_path: str,
|
output_path: str,
|
||||||
infer_order: Optional[str] = 'random',
|
meta_review_prompt_template: Optional[ConfigDict] = None,
|
||||||
dataset_cfg: Optional[ConfigDict] = None,
|
dataset_cfg: Optional[ConfigDict] = None,
|
||||||
postprocessor: ConfigDict = dict(type=first_number_postprocess)
|
postprocessor: ConfigDict = dict(type=first_number_postprocess)
|
||||||
) -> None:
|
) -> None:
|
||||||
assert infer_order in ['random', 'double']
|
|
||||||
self.output_path = output_path
|
self.output_path = output_path
|
||||||
out_dir, out_name = osp.split(output_path)
|
out_dir, out_name = osp.split(output_path)
|
||||||
if not out_dir:
|
if not out_dir:
|
||||||
out_dir = './'
|
out_dir = './'
|
||||||
|
|
||||||
self.prompt_tmpl = ICL_PROMPT_TEMPLATES.build(prompt_template)
|
self.prompt_tmpl = ICL_PROMPT_TEMPLATES.build(prompt_template)
|
||||||
|
if meta_review_prompt_template is not None:
|
||||||
|
self.meta_review_prompt_tmpl = ICL_PROMPT_TEMPLATES.build(
|
||||||
|
meta_review_prompt_template)
|
||||||
|
|
||||||
max_out_len = judge_cfg.get('max_out_len', None)
|
max_out_len = judge_cfg.get('max_out_len', None)
|
||||||
batch_size = judge_cfg.get('batch_size', None)
|
batch_size = judge_cfg.get('batch_size', None)
|
||||||
@ -108,16 +112,20 @@ class LMEvaluator:
|
|||||||
self.postprocessor = get_type_from_cfg(postprocessor)
|
self.postprocessor = get_type_from_cfg(postprocessor)
|
||||||
self.logger = get_logger()
|
self.logger = get_logger()
|
||||||
self.dataset_cfg = dataset_cfg
|
self.dataset_cfg = dataset_cfg
|
||||||
self.infer_order = infer_order
|
|
||||||
|
|
||||||
def score(self, predictions, references: Optional[List] = None) -> Dict:
|
def score(self,
|
||||||
|
predictions,
|
||||||
|
judgements: Optional[List] = None,
|
||||||
|
references: Optional[List] = None,
|
||||||
|
meta: Optional[bool] = False,
|
||||||
|
infer_order: Optional[str] = 'random') -> Dict:
|
||||||
dup_indices = []
|
dup_indices = []
|
||||||
if type(predictions) == list:
|
if type(predictions) == list:
|
||||||
"""Apply to multi-model comparison."""
|
"""Apply to multi-model comparison."""
|
||||||
references = [{} for _ in range(len(predictions[0]['model_preds']))
|
references = [{} for _ in range(len(predictions[0]['model_preds']))
|
||||||
] if references is None else references
|
] if references is None else references
|
||||||
predictions, references = order_preds_and_record_references(
|
predictions, references = order_preds_and_record_references(
|
||||||
predictions, references, self.infer_order)
|
predictions, references, infer_order)
|
||||||
|
|
||||||
# calculate dupicated predictions numbers
|
# calculate dupicated predictions numbers
|
||||||
total_predictions_num = len(predictions[0])
|
total_predictions_num = len(predictions[0])
|
||||||
@ -135,6 +143,9 @@ class LMEvaluator:
|
|||||||
] if references is None else references
|
] if references is None else references
|
||||||
predictions = [predictions['model_preds']]
|
predictions = [predictions['model_preds']]
|
||||||
|
|
||||||
|
# Due to the rarity of identical predictions, we have temporarily disabled the plagiarism detection feature.
|
||||||
|
dup_indices = []
|
||||||
|
|
||||||
if len(dup_indices) != 0:
|
if len(dup_indices) != 0:
|
||||||
# remove dupicated predictions
|
# remove dupicated predictions
|
||||||
for index in sorted(dup_indices, reverse=True):
|
for index in sorted(dup_indices, reverse=True):
|
||||||
@ -149,6 +160,14 @@ class LMEvaluator:
|
|||||||
for i in range(len(predictions)):
|
for i in range(len(predictions)):
|
||||||
key = 'prediction' if i == 0 else f'prediction{i + 1}'
|
key = 'prediction' if i == 0 else f'prediction{i + 1}'
|
||||||
pred_dict[key] = predictions[i]
|
pred_dict[key] = predictions[i]
|
||||||
|
if judgements:
|
||||||
|
for i in range(len(judgements)):
|
||||||
|
key = 'judgement' if i == 0 else f'judgement{i + 1}'
|
||||||
|
pred_dict[key] = judgements[i]['model_preds']
|
||||||
|
for j in range(len(references)):
|
||||||
|
references[j]['judge_model' +
|
||||||
|
str(i + 1)] = judgements[i]['model_name']
|
||||||
|
|
||||||
elif isinstance(
|
elif isinstance(
|
||||||
predictions[0][0], list
|
predictions[0][0], list
|
||||||
): #multi round for format like [[[{'round':1, 'user':'', 'assistant':''}, {'round':2, 'user':'', 'assistant':''}], [{'round':1, 'user':'', 'assistant':''}, {'round':2, 'user':'', 'assistant':''}]]]
|
): #multi round for format like [[[{'round':1, 'user':'', 'assistant':''}, {'round':2, 'user':'', 'assistant':''}], [{'round':1, 'user':'', 'assistant':''}, {'round':2, 'user':'', 'assistant':''}]]]
|
||||||
@ -158,11 +177,13 @@ class LMEvaluator:
|
|||||||
key = 'prediction' if i == 0 else f'prediction{i}'
|
key = 'prediction' if i == 0 else f'prediction{i}'
|
||||||
key += '_r' + str(j + 1)
|
key += '_r' + str(j + 1)
|
||||||
pred_dict[key] = multiround_predictions[j]
|
pred_dict[key] = multiround_predictions[j]
|
||||||
|
if judgements:
|
||||||
|
raise NotImplementedError(
|
||||||
|
'Not applied meta-reivew judge on multi-round dataset')
|
||||||
if self.dataset_cfg:
|
if self.dataset_cfg:
|
||||||
dataset = build_dataset_from_cfg(self.dataset_cfg)
|
dataset = build_dataset_from_cfg(self.dataset_cfg)
|
||||||
|
|
||||||
if self.infer_order == 'double':
|
if infer_order == 'double':
|
||||||
new_ds = {
|
new_ds = {
|
||||||
k: dataset.test[k] * 2
|
k: dataset.test[k] * 2
|
||||||
for k in dataset.test.column_names
|
for k in dataset.test.column_names
|
||||||
@ -179,7 +200,6 @@ class LMEvaluator:
|
|||||||
print(
|
print(
|
||||||
f'Among total {total_predictions_num} predictions, there are {len(dup_indices)} predictions totally same, which are removed!'
|
f'Among total {total_predictions_num} predictions, there are {len(dup_indices)} predictions totally same, which are removed!'
|
||||||
)
|
)
|
||||||
|
|
||||||
for k, v in pred_dict.items():
|
for k, v in pred_dict.items():
|
||||||
dataset.reader.dataset['test'] = dataset.test.add_column(k, v)
|
dataset.reader.dataset['test'] = dataset.test.add_column(k, v)
|
||||||
dataset.reader.input_columns.append(k)
|
dataset.reader.input_columns.append(k)
|
||||||
@ -201,8 +221,13 @@ class LMEvaluator:
|
|||||||
**pred_dict)
|
**pred_dict)
|
||||||
dataset.reader.output_column = 'reference'
|
dataset.reader.output_column = 'reference'
|
||||||
retriever = ZeroRetriever(dataset)
|
retriever = ZeroRetriever(dataset)
|
||||||
self.inferencer.inference(retriever=retriever,
|
if meta:
|
||||||
prompt_template=self.prompt_tmpl)
|
self.inferencer.inference(
|
||||||
|
retriever=retriever,
|
||||||
|
prompt_template=self.meta_review_prompt_tmpl)
|
||||||
|
else:
|
||||||
|
self.inferencer.inference(retriever=retriever,
|
||||||
|
prompt_template=self.prompt_tmpl)
|
||||||
|
|
||||||
output = mmengine.load(self.output_path)
|
output = mmengine.load(self.output_path)
|
||||||
return self.postprocess(output)
|
return self.postprocess(output)
|
||||||
|
@@ -1,3 +1,4 @@
+# flake8: noqa: E501
 import inspect
 from abc import abstractmethod
 from copy import deepcopy
@@ -81,11 +82,21 @@ class BasePartitioner:
                               work_dir=work_dir,
                               out_dir=self.out_dir,
                               add_cfg=add_cfg)
-        self.logger.info(f'Partitioned into {len(tasks)} tasks.')
-        for i, task in enumerate(tasks):
-            self.logger.debug(f'Task {i}: {task_abbr_from_cfg(task)}')
+        if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
+                tasks[0], list):
+            self.logger.info(
+                f'Now we are in the subjective evluation! Partitioned into 2 stages and {len(tasks[0])} tasks in first stage, {len(tasks[1])} tasks in second stage.'
+            )
+            cnt = 0
+            for task_part in tasks:
+                for task in task_part:
+                    self.logger.debug(
+                        f'Task {cnt}: {task_abbr_from_cfg(task)}')
+                    cnt += 1
+        else:
+            self.logger.info(f'Partitioned into {len(tasks)} tasks.')
+            for i, task in enumerate(tasks):
+                self.logger.debug(f'Task {i}: {task_abbr_from_cfg(task)}')
         return tasks
 
     def parse_model_dataset_args(self, cfg: ConfigDict):
@@ -1,14 +1,20 @@
+# flake8: noqa: E501
+import copy
+import os.path as osp
 from itertools import combinations, product
 from typing import Dict, List, Optional, Tuple
 
 from mmengine.config import ConfigDict
 
 from opencompass.registry import PARTITIONERS
+from opencompass.utils import (deal_with_judge_model_abbr,
+                               get_infer_output_path, model_abbr_from_cfg)
 
 from .naive import NaivePartitioner
 
 
 def remove_duplicate_pairs(model_combinations):
+    # For compare mode, we need to remove redundant pairs first
     combo_dict = {}
     for i, combo in enumerate(model_combinations):
         sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
@@ -20,6 +26,82 @@ def remove_duplicate_pairs(model_combinations):
     return new_model_combinations
 
 
+def replicate_tasks_with_judge_models(tasks, judge_models, meta_judge_model):
+    # When all tasks are already partitioned, we add judge_models and meta_judge_model as additional args.
+    if meta_judge_model:
+        replicated_tasks = [[], []]
+    else:
+        replicated_tasks = []
+    for task in tasks:
+        replicated_task_dicts = [task.copy() for _ in range(len(judge_models))]
+        for idx, replicated_task in enumerate(replicated_task_dicts):
+            replicated_task['judge_model'] = judge_models[idx]
+        if meta_judge_model:
+            meta_task = task.copy()
+            meta_task['meta_judge_model'] = meta_judge_model
+            meta_task['judge_models'] = judge_models
+            replicated_tasks[1].append(meta_task)
+            replicated_tasks[0].extend(replicated_task_dicts)
+        else:
+            replicated_tasks.extend(replicated_task_dicts)
+    return replicated_tasks
+
+
+def remove_already_tasks(tasks, work_dir, meta_judge_model):
+    # Check and remove the already finished subjective evaluation tasks
+    if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
+            tasks[0], list):
+        tasks_to_keep = [[], []]
+        for i in range(2):
+            for task in tasks[i]:
+                temp_task = copy.deepcopy(task)
+                to_delete_index = [
+                ]  # To deal with the situation that the partition strategy is not split, which means that there will be a task contains multi dataset, and when we need to re-start, we need to remove the already done tasks.
+                for idx, dataset in enumerate(task['datasets'][0]):
+                    if i == 0:
+                        filename = get_infer_output_path(
+                            deal_with_judge_model_abbr(task['models'][0],
+                                                       task['judge_model'],
+                                                       False), dataset,
+                            osp.join(work_dir, 'results'))
+                    else:
+                        filename = get_infer_output_path(
+                            deal_with_judge_model_abbr(
+                                task['models'][0], task['meta_judge_model'],
+                                True), dataset, osp.join(work_dir, 'results'))
+                    if osp.exists(filename):
+                        to_delete_index.append(idx)
+                temp_task['datasets'][0] = [
+                    temp_task['datasets'][0][j]
+                    for j in range(len(temp_task['datasets'][0]))
+                    if j not in to_delete_index
+                ]
+                if len(temp_task['datasets'][0]) != 0:
+                    tasks_to_keep[i].append(temp_task)
+    else:
+        tasks_to_keep = []
+        for task in tasks:
+            temp_task = copy.deepcopy(task)
+            to_delete_index = [
+            ]  # To deal with the situation that the partition strategy is not split, which means that there will be a task contains multi dataset, and when we need to re-start, we need to remove the already done tasks.
+            for idx, dataset in enumerate(task['datasets'][0]):
+                filename = get_infer_output_path(
+                    deal_with_judge_model_abbr(task['models'][0],
+                                               task['judge_model']), dataset,
+                    osp.join(work_dir, 'results'))
+                if osp.exists(filename):
+                    to_delete_index.append(idx)
+            # Remove the already done tasks
+            temp_task['datasets'][0] = [
+                temp_task['datasets'][0][j]
+                for j in range(len(temp_task['datasets'][0]))
+                if j not in to_delete_index
+            ]
+            if len(temp_task['datasets'][0]) != 0:
+                tasks_to_keep.append(temp_task)
+    return tasks_to_keep
+
+
 @PARTITIONERS.register_module()
 class SubjectiveNaivePartitioner(NaivePartitioner):
     """Naive task partitioner for subjective evaluation. Compared to
@@ -37,15 +119,22 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
                  models: Optional[List[ConfigDict]] = [],
                  base_models: Optional[List[ConfigDict]] = [],
                  compare_models: Optional[List[ConfigDict]] = [],
+                 judge_models: Optional[List[ConfigDict]] = [],
+                 meta_judge_model: Optional[ConfigDict] = None,
                  model_pairs: Optional[List[Tuple]] = None,
-                 keep_keys: Optional[List[str]] = None):
+                 keep_keys: Optional[List[str]] = None,
+                 infer_order: Optional[str] = 'random'):
         super().__init__(out_dir=out_dir, keep_keys=keep_keys)
         assert mode in ['singlescore', 'allpair', 'm2n', 'fixed']
+        assert infer_order in ['random', 'double']
         self.mode = mode
         self.models = models
         self.base_models = base_models
         self.compare_models = compare_models
         self.model_pairs = model_pairs
+        self.judge_models = judge_models
+        self.meta_judge_model = meta_judge_model
+        self.infer_order = infer_order
 
     def get_model_combinations(
             self,
@@ -97,14 +186,35 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
         """
         models = self.models if self.models != [] else models
         base_models, compare_models = self.base_models, self.compare_models
+        judge_models, meta_judge_model = self.judge_models, self.meta_judge_model
         if self.mode == 'singlescore':
             models = models
         else:
            models = self.get_model_combinations(models, base_models,
                                                 compare_models)
         model_dataset_combinations = [{'models': models, 'datasets': datasets}]
-        return super().partition(
+        tasks = super().partition(
             model_dataset_combinations=model_dataset_combinations,
             work_dir=work_dir,
             out_dir=out_dir,
             add_cfg=add_cfg)
+
+        # We need to add judge models and meta-judge-model as new tasks
+        # When there is no meta-judge-model, we assign all judge models to each tasks
+        # When there is a meta-judge-model, we add an additional task stage
+        tasks = replicate_tasks_with_judge_models(tasks, judge_models,
+                                                  meta_judge_model)
+
+        # We also need to check and remove the already done tasks
+        tasks = remove_already_tasks(tasks, work_dir, meta_judge_model)
+        if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
+                tasks[0], list):
+            # Refer to meta review judge
+            for task_stage in tasks:
+                for task in task_stage:
+                    task['infer_order'] = self.infer_order
+        else:
+            # Refer to just have review judge
+            for task in tasks:
+                task['infer_order'] = self.infer_order
+        return tasks
@@ -1,3 +1,4 @@
+# flake8: noqa: E501
 import copy
 import math
 import os.path as osp
@@ -11,7 +12,8 @@ from opencompass.registry import PARTITIONERS
 from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
                                get_infer_output_path)
 
-from .sub_naive import SubjectiveNaivePartitioner
+from .sub_naive import (SubjectiveNaivePartitioner, remove_already_tasks,
+                        replicate_tasks_with_judge_models)
 
 
 @PARTITIONERS.register_module()
|
|||||||
models: Optional[List[ConfigDict]] = [],
|
models: Optional[List[ConfigDict]] = [],
|
||||||
base_models: Optional[List[ConfigDict]] = [],
|
base_models: Optional[List[ConfigDict]] = [],
|
||||||
compare_models: Optional[List[ConfigDict]] = [],
|
compare_models: Optional[List[ConfigDict]] = [],
|
||||||
|
judge_models: Optional[List[ConfigDict]] = [],
|
||||||
|
meta_judge_model: Optional[ConfigDict] = None,
|
||||||
model_pairs: Optional[List[Tuple]] = None,
|
model_pairs: Optional[List[Tuple]] = None,
|
||||||
max_task_size: int = 40000,
|
max_task_size: int = 40000,
|
||||||
gen_task_coef: int = 20,
|
gen_task_coef: int = 20,
|
||||||
strategy: str = 'heuristic',
|
strategy: str = 'heuristic',
|
||||||
dataset_size_path: str = '.cache/dataset_size.json',
|
dataset_size_path: str = '.cache/dataset_size.json',
|
||||||
keep_keys: Optional[List[str]] = None):
|
keep_keys: Optional[List[str]] = None,
|
||||||
|
infer_order: Optional[str] = 'random'):
|
||||||
super().__init__(out_dir=out_dir,
|
super().__init__(out_dir=out_dir,
|
||||||
keep_keys=keep_keys,
|
keep_keys=keep_keys,
|
||||||
mode=mode,
|
mode=mode,
|
||||||
models=models,
|
models=models,
|
||||||
base_models=base_models,
|
base_models=base_models,
|
||||||
compare_models=compare_models,
|
compare_models=compare_models,
|
||||||
model_pairs=model_pairs)
|
judge_models=judge_models,
|
||||||
|
meta_judge_model=meta_judge_model,
|
||||||
|
model_pairs=model_pairs,
|
||||||
|
infer_order=infer_order)
|
||||||
self.max_task_size = max_task_size
|
self.max_task_size = max_task_size
|
||||||
self.gen_task_coef = gen_task_coef
|
self.gen_task_coef = gen_task_coef
|
||||||
self.dataset_size_path = dataset_size_path
|
self.dataset_size_path = dataset_size_path
|
||||||
@ -96,13 +104,13 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner):
|
|||||||
"""
|
"""
|
||||||
models = self.models if self.models != [] else models
|
models = self.models if self.models != [] else models
|
||||||
base_models, compare_models = self.base_models, self.compare_models
|
base_models, compare_models = self.base_models, self.compare_models
|
||||||
|
judge_models, meta_judge_model = self.judge_models, self.meta_judge_model
|
||||||
if self.mode == 'singlescore':
|
if self.mode == 'singlescore':
|
||||||
models = models
|
models = models
|
||||||
else:
|
else:
|
||||||
models = super().get_model_combinations(models, base_models,
|
models = super().get_model_combinations(models, base_models,
|
||||||
compare_models)
|
compare_models)
|
||||||
model_dataset_combinations = [{'models': models, 'datasets': datasets}]
|
model_dataset_combinations = [{'models': models, 'datasets': datasets}]
|
||||||
|
|
||||||
tasks = []
|
tasks = []
|
||||||
for comb in model_dataset_combinations:
|
for comb in model_dataset_combinations:
|
||||||
comb['datasets'] = sorted(comb['datasets'],
|
comb['datasets'] = sorted(comb['datasets'],
|
||||||
@ -113,8 +121,8 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner):
|
|||||||
for dataset in comb['datasets']:
|
for dataset in comb['datasets']:
|
||||||
filename = get_infer_output_path(model, dataset, out_dir)
|
filename = get_infer_output_path(model, dataset, out_dir)
|
||||||
# skip the task if the task output exists
|
# skip the task if the task output exists
|
||||||
if osp.exists(filename):
|
# if osp.exists(filename):
|
||||||
continue
|
# continue
|
||||||
dataset_size = self.get_cost(dataset)
|
dataset_size = self.get_cost(dataset)
|
||||||
if dataset_size > self.max_task_size:
|
if dataset_size > self.max_task_size:
|
||||||
root, ext = osp.splitext(filename)
|
root, ext = osp.splitext(filename)
|
||||||
@ -151,6 +159,21 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner):
|
|||||||
'work_dir': work_dir,
|
'work_dir': work_dir,
|
||||||
**add_cfg
|
**add_cfg
|
||||||
}))
|
}))
|
||||||
|
|
||||||
|
tasks = replicate_tasks_with_judge_models(tasks, judge_models,
|
||||||
|
meta_judge_model)
|
||||||
|
tasks = remove_already_tasks(tasks, work_dir, meta_judge_model)
|
||||||
|
|
||||||
|
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
|
||||||
|
tasks[0], list):
|
||||||
|
# Refer to meta review judge
|
||||||
|
for task_stage in tasks:
|
||||||
|
for task in task_stage:
|
||||||
|
task['infer_order'] = self.infer_order
|
||||||
|
else:
|
||||||
|
# Refer to just have review judge
|
||||||
|
for task in tasks:
|
||||||
|
task['infer_order'] = self.infer_order
|
||||||
return tasks
|
return tasks
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@@ -309,7 +309,7 @@ class AlignmentBenchSummarizer:
         self.eval_model_abbrs = [
             model_abbr_from_cfg(model) for model in self.eval_model_cfgs
         ]
-        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
+        self.judge_models = self.cfg.get('judge_models', None)
         self.judge_type = judge_type
         assert self.judge_type in [
             'general', 'autoj', 'judgelm', 'general_plus'
@@ -333,33 +333,36 @@ class AlignmentBenchSummarizer:
         Returns:
             pd.DataFrame: The summary results.
         """
-        dataset_cfgs = self.cfg['datasets']
-        output_dir, results_folder = get_outdir(self.cfg, time_str)
-        fout_flag, fout_flag2 = 0, 0
-        for eval_model_abbr in self.eval_model_abbrs:
-            subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
-            subdir_path = os.path.join(results_folder, subdir)
-            if os.path.isdir(subdir_path):
-                model, judge_model = eval_model_abbr, self.judge_abbr
-                if self.judge_type == 'general':
-                    fout = osp.join(
-                        output_dir,
-                        'judged-by--' + judge_model + '-dimension.csv')
-                fout2 = osp.join(
-                    output_dir,
-                    'judged-by--' + judge_model + '-capability.csv')
-                for dataset in dataset_cfgs:
-                    judged_answers, references = get_judgeanswer_and_reference(
-                        dataset, subdir_path, self.judge_function)
-                    if self.judge_type == 'general':
-                        get_dimension_results(judged_answers, references, fout,
-                                              fout_flag, model)
-                        fout_flag += 1
-                    get_capability_results(judged_answers, references, fout2,
-                                           fout_flag2, model, self.category)
-                    fout_flag2 += 1
-            else:
-                print(subdir_path + ' is not exist! please check!')
+        for judge_model in self.judge_models:
+            judge_abbr = model_abbr_from_cfg(judge_model)
+            dataset_cfgs = self.cfg['datasets']
+            output_dir, results_folder = get_outdir(self.cfg, time_str)
+            fout_flag, fout_flag2 = 0, 0
+            for eval_model_abbr in self.eval_model_abbrs:
+                subdir = eval_model_abbr + '_judged-by--' + judge_abbr
+                subdir_path = os.path.join(results_folder, subdir)
+                if os.path.isdir(subdir_path):
+                    model = eval_model_abbr
+                    if self.judge_type == 'general':
+                        fout = osp.join(
+                            output_dir,
+                            'judged-by--' + judge_abbr + '-dimension.csv')
+                    fout2 = osp.join(
+                        output_dir,
+                        'judged-by--' + judge_abbr + '-capability.csv')
+                    for dataset in dataset_cfgs:
+                        judged_answers, references = get_judgeanswer_and_reference(
+                            dataset, subdir_path, self.judge_function)
+                        if self.judge_type == 'general':
+                            get_dimension_results(judged_answers, references,
+                                                  fout, fout_flag, model)
+                            fout_flag += 1
+                        get_capability_results(judged_answers, references,
+                                               fout2, fout_flag2, model,
+                                               self.category)
+                        fout_flag2 += 1
+                else:
+                    print(subdir_path + ' is not exist! please check!')
         if self.judge_type == 'general':
             with open(fout, 'r') as f:
                 x = from_csv(f)
@@ -82,7 +82,8 @@ class AlpacaSummarizer:
         self.cfg = config
         self.base_models = self.cfg['eval']['partitioner']['base_models']
         self.compare_models = self.cfg['eval']['partitioner']['compare_models']
-        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
+        self.judge_abbr = model_abbr_from_cfg(
+            self.cfg['judge_models'][0])  # We will reorganize the summarizers
         self.judge_type = judge_type
         assert self.judge_type in ['v1', 'v2']
         self.judge_map = {
|
@ -67,7 +67,9 @@ class CompassArenaSummarizer:
|
|||||||
self.cfg = config
|
self.cfg = config
|
||||||
self.base_models = self.cfg['eval']['partitioner']['base_models']
|
self.base_models = self.cfg['eval']['partitioner']['base_models']
|
||||||
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
|
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
|
||||||
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
|
self.judge_models = self.cfg.get('judge_models', None)
|
||||||
|
self.meta_judge_model = self.cfg.eval.partitioner.get(
|
||||||
|
'meta_judge_model', None)
|
||||||
self.judge_type = judge_type
|
self.judge_type = judge_type
|
||||||
assert self.judge_type in ['general']
|
assert self.judge_type in ['general']
|
||||||
self.judge_map = {
|
self.judge_map = {
|
||||||
@ -95,109 +97,135 @@ class CompassArenaSummarizer:
|
|||||||
product(self.base_models, self.compare_models))
|
product(self.base_models, self.compare_models))
|
||||||
unique_combinations = remove_duplicate_pairs(
|
unique_combinations = remove_duplicate_pairs(
|
||||||
[combo for combo in model_combinations if combo[0] != combo[1]])
|
[combo for combo in model_combinations if combo[0] != combo[1]])
|
||||||
judge_model = self.judge_abbr
|
|
||||||
fout_list = []
|
fout_list = []
|
||||||
for dataset in dataset_cfgs:
|
pre_len = len(self.judge_models)
|
||||||
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
if self.meta_judge_model is not None:
|
||||||
fout = osp.join(
|
self.judge_models.append(self.meta_judge_model)
|
||||||
output_dir, 'judged-by--' + judge_model + '-' + dataset_abbr +
|
meta_judge_model_abbr = model_abbr_from_cfg(self.meta_judge_model)
|
||||||
'-report.csv')
|
else:
|
||||||
fout_list.append(fout)
|
meta_judge_model_abbr = None
|
||||||
for model_pair in unique_combinations:
|
for idx, judge_model in enumerate(self.judge_models):
|
||||||
model1, model2, = model_pair[0]['abbr'], model_pair[1]['abbr'],
|
judge_model = model_abbr_from_cfg(judge_model)
|
||||||
subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
|
for dataset in dataset_cfgs:
|
||||||
subdir_path = os.path.join(results_folder, subdir)
|
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||||
if os.path.isdir(subdir_path):
|
if idx == pre_len:
|
||||||
judged_answers, references = get_judgeanswer_and_reference(
|
fout = osp.join(
|
||||||
dataset,
|
output_dir, 'summarized-by--' + judge_model + '-' +
|
||||||
subdir_path,
|
dataset_abbr + '-report.csv')
|
||||||
self.judge_function,
|
|
||||||
)
|
|
||||||
if self.check_pos_bias:
|
|
||||||
bias_num = check_position_bias(judged_answers,
|
|
||||||
references)
|
|
||||||
else:
|
|
||||||
bias_num = 0
|
|
||||||
win_model1, win_model2, categories = defaultdict(
|
|
||||||
float), defaultdict(float), defaultdict(float)
|
|
||||||
model1, model2 = references[0]['answer1'], references[0][
|
|
||||||
'answer2']
|
|
||||||
for prediction, reference in zip(judged_answers,
|
|
||||||
references):
|
|
||||||
if self.summary_type == 'single':
|
|
||||||
if prediction == 'A':
|
|
||||||
categories['total'] += 1
|
|
||||||
categories[reference['capability']] += 1
|
|
||||||
if reference['answer1'] == model1:
|
|
||||||
win_model1[reference['capability']] += 1
|
|
||||||
win_model1['total'] += 1
|
|
||||||
else:
|
|
||||||
win_model2[reference['capability']] += 1
|
|
||||||
win_model2['total'] += 1
|
|
||||||
elif prediction == 'B':
|
|
||||||
categories['total'] += 1
|
|
||||||
categories[reference['capability']] += 1
|
|
||||||
if reference['answer1'] == model1:
|
|
||||||
win_model2[reference['capability']] += 1
|
|
||||||
win_model2['total'] += 1
|
|
||||||
else:
|
|
||||||
win_model1[reference['capability']] += 1
|
|
||||||
win_model1['total'] += 1
|
|
||||||
elif self.summary_type == 'half_add':
|
|
||||||
categories['total'] += 1
|
|
||||||
categories[reference['capability']] += 1
|
|
||||||
if prediction == 'A':
|
|
||||||
if reference['answer1'] == model1:
|
|
||||||
win_model1[reference['capability']] += 1
|
|
||||||
win_model1['total'] += 1
|
|
||||||
else:
|
|
||||||
win_model2[reference['capability']] += 1
|
|
||||||
win_model2['total'] += 1
|
|
||||||
elif prediction == 'B':
|
|
||||||
if reference['answer1'] == model1:
|
|
||||||
win_model2[reference['capability']] += 1
|
|
||||||
win_model2['total'] += 1
|
|
||||||
else:
|
|
||||||
win_model1[reference['capability']] += 1
|
|
||||||
win_model1['total'] += 1
|
|
||||||
elif prediction == 'C':
|
|
||||||
win_model1[reference['capability']] += 0.5
|
|
||||||
win_model1['total'] += 0.5
|
|
||||||
-                                win_model2[reference['capability']] += 0.5
-                                win_model2['total'] += 0.5
-                    for capability in categories:
-                        if capability not in win_model1:
-                            win_model1[capability] = 0.0
-                        else:
-                            win_model1[capability] = round(
-                                (win_model1[capability] /
-                                 categories[capability]) * 100, 2)
-                        if capability not in win_model2:
-                            win_model2[capability] = 0.0
-                        else:
-                            win_model2[capability] = round(
-                                (win_model2[capability] /
-                                 categories[capability]) * 100, 2)
-                    win_model1['position_bias'] = bias_num
-                    win_model2['position_bias'] = bias_num
-                    scores = {
-                        'win_' + model1: win_model1,
-                        'win_' + model2: win_model2
-                    }
-                    rows = list(scores.keys())
-                    columns = list(scores[rows[0]].keys())
-                    columns.insert(0, columns.pop(columns.index('total')))
-                    columns.insert(1,
-                                   columns.pop(columns.index('position_bias')))
-                    with open(fout, 'a+', newline='') as csvfile:
-                        writer = csv.writer(csvfile)
-                        writer.writerow([model1 + '_vs_' + model2] + columns)
-                        for row in rows:
-                            writer.writerow(
-                                [row] +
-                                [scores[row][column] for column in columns])
-                else:
-                    print(subdir_path + ' is not exist! please check!')
+                fout = osp.join(
+                    output_dir, 'judged-by--' + judge_model + '-' +
+                    dataset_abbr + '-report.csv')
+                fout_list.append(fout)
+                for model_pair in unique_combinations:
+                    model1, model2, = model_pair[0]['abbr'], model_pair[1][
+                        'abbr'],
+                    if idx == pre_len:
+                        subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
+                    else:
+                        subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
+                    subdir_path = os.path.join(results_folder, subdir)
+                    if os.path.isdir(subdir_path):
+                        judged_answers, references = get_judgeanswer_and_reference(
+                            dataset,
+                            subdir_path,
+                            self.judge_function,
+                        )
+                        if self.check_pos_bias:
+                            bias_num = check_position_bias(
+                                judged_answers, references)
+                        else:
+                            bias_num = 0
+                        win_model1, win_model2, categories = defaultdict(
+                            float), defaultdict(float), defaultdict(float)
+                        model1, model2 = references[0]['answer1'], references[
+                            0]['answer2']
+                        for prediction, reference in zip(
+                                judged_answers, references):
+                            if self.summary_type == 'single':
+                                if prediction == 'A':
+                                    categories['total'] += 1
+                                    categories[reference['capability']] += 1
+                                    if reference['answer1'] == model1:
+                                        win_model1[
+                                            reference['capability']] += 1
+                                        win_model1['total'] += 1
+                                    else:
+                                        win_model2[
+                                            reference['capability']] += 1
+                                        win_model2['total'] += 1
+                                elif prediction == 'B':
+                                    categories['total'] += 1
+                                    categories[reference['capability']] += 1
+                                    if reference['answer1'] == model1:
+                                        win_model2[
+                                            reference['capability']] += 1
+                                        win_model2['total'] += 1
+                                    else:
+                                        win_model1[
+                                            reference['capability']] += 1
+                                        win_model1['total'] += 1
+                            elif self.summary_type == 'half_add':
+                                categories['total'] += 1
+                                categories[reference['capability']] += 1
+                                if prediction == 'A':
+                                    if reference['answer1'] == model1:
+                                        win_model1[
+                                            reference['capability']] += 1
+                                        win_model1['total'] += 1
+                                    else:
+                                        win_model2[
+                                            reference['capability']] += 1
+                                        win_model2['total'] += 1
+                                elif prediction == 'B':
+                                    if reference['answer1'] == model1:
+                                        win_model2[
+                                            reference['capability']] += 1
+                                        win_model2['total'] += 1
+                                    else:
+                                        win_model1[
+                                            reference['capability']] += 1
+                                        win_model1['total'] += 1
+                                elif prediction == 'C':
+                                    win_model1[reference['capability']] += 0.5
+                                    win_model1['total'] += 0.5
+                                    win_model2[reference['capability']] += 0.5
+                                    win_model2['total'] += 0.5
+                        for capability in categories:
+                            if capability not in win_model1:
+                                win_model1[capability] = 0.0
+                            else:
+                                win_model1[capability] = round(
+                                    (win_model1[capability] /
+                                     categories[capability]) * 100, 2)
+                            if capability not in win_model2:
+                                win_model2[capability] = 0.0
+                            else:
+                                win_model2[capability] = round(
+                                    (win_model2[capability] /
+                                     categories[capability]) * 100, 2)
+                        win_model1['position_bias'] = bias_num
+                        win_model2['position_bias'] = bias_num
+                        scores = {
+                            'win_' + model1: win_model1,
+                            'win_' + model2: win_model2
+                        }
+                        rows = list(scores.keys())
+                        columns = list(scores[rows[0]].keys())
+                        columns.insert(0, columns.pop(columns.index('total')))
+                        columns.insert(
+                            1, columns.pop(columns.index('position_bias')))
+                        with open(fout, 'a+', newline='') as csvfile:
+                            writer = csv.writer(csvfile)
+                            writer.writerow([model1 + '_vs_' + model2] +
+                                            columns)
+                            for row in rows:
+                                writer.writerow([row] + [
+                                    scores[row][column] for column in columns
+                                ])
+                    else:
+                        print(subdir_path + ' is not exist! please check!')
         for fout in fout_list:
             with open(fout, 'r') as f:
                 x = from_csv(f)
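Editor's note: the block above tallies pairwise judgements into per-capability win rates, now once per judge model (and once more for the meta judge, hence the 'summarized-by--' report file). As a rough, standalone illustration of the 'half_add' scoring it applies (A = one point to answer 1, B = one point to answer 2, C = half a point to each, then normalised to a percentage), here is a minimal sketch; the record layout and function name are invented for illustration, and unlike the real code it does not resolve which concrete model produced answer 1.

from collections import defaultdict


def half_add_win_rates(judgements):
    # Illustrative only: each judgement is a dict such as
    # {'choice': 'A' | 'B' | 'C', 'capability': 'reasoning'}.
    wins1, wins2, totals = defaultdict(float), defaultdict(float), defaultdict(float)
    for j in judgements:
        score1 = {'A': 1.0, 'B': 0.0, 'C': 0.5}[j['choice']]
        for key in ('total', j['capability']):
            totals[key] += 1
            wins1[key] += score1
            wins2[key] += 1.0 - score1
    rates1 = {k: round(v / totals[k] * 100, 2) for k, v in wins1.items()}
    rates2 = {k: round(v / totals[k] * 100, 2) for k, v in wins2.items()}
    return rates1, rates2


# One win for answer 1 plus one tie on 'reasoning' -> 75.0 vs 25.0.
r1, r2 = half_add_win_rates([
    {'choice': 'A', 'capability': 'reasoning'},
    {'choice': 'C', 'capability': 'reasoning'},
])
assert r1['reasoning'] == 75.0 and r2['reasoning'] == 25.0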
@@ -98,7 +98,7 @@ class MTBenchSummarizer(CompassArenaSummarizer):
             self.base_models = self.cfg['eval']['partitioner']['base_models']
             self.compare_models = self.cfg['eval']['partitioner'][
                 'compare_models']
-        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
+        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
         self.judge_map = {
             'single': post_process_mtbench_single,
             'pair': post_process_mtbench_pair
@@ -1,10 +1,11 @@
+# flake8: noqa: E501
 import argparse
 import copy
 import fnmatch
 import os.path as osp
 import random
 import time
-from typing import List, Union
+from typing import List, Optional, Union

 import mmengine
 from mmengine.config import Config, ConfigDict
@@ -14,6 +15,7 @@ from opencompass.registry import ICL_EVALUATORS, MODELS, TEXT_POSTPROCESSORS
 from opencompass.tasks.base import BaseTask
 from opencompass.tasks.openicl_eval import extract_role_pred
 from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
+                               deal_with_judge_model_abbr,
                                get_infer_output_path, get_logger,
                                model_abbr_from_cfg, task_abbr_from_cfg)

@@ -35,21 +37,25 @@ class SubjectiveEvalTask(BaseTask):
     def __init__(self, cfg: ConfigDict):
         super().__init__(cfg)
         self.logger = get_logger()
-        judge_cfg = cfg.eval.runner.task.get('judge_cfg', {})
-        if type(judge_cfg) != ConfigDict:
-            print('*' * 100)
-            print('Due to different Judge model needs different summarizer and'
-                  " prompts, we don't support multi judge model evaluation at "
-                  'one time, please do not use list to set your judge cfg, jus'
-                  't use a dict or list[0] should be fine. If you want to eval'
-                  'uation multi judge model in one script, we suggest you to u'
-                  'se a bash or bat script to start multi configs evaluation!')
-            print('*' * 100)
-        assert type(judge_cfg) == ConfigDict
+        judge_cfg = cfg.get('judge_model', None)
+        meta_judge_cfg = cfg.get('meta_judge_model', None)
+        judge_models = cfg.get('judge_models', None)
+
+        if judge_cfg is None and meta_judge_cfg is None:
+            assert judge_cfg is not None, 'Both judge_cfg and meta_judge_cfg are None, but judge_models must be provided.'
+
+        if meta_judge_cfg is not None:
+            assert judge_models is not None, 'meta_judge_cfg is provided, but judge_models are missing.'
+            judge_cfg = meta_judge_cfg  # Replace judge_cfg with meta_judge_cfg when it is not None
+            self.meta = True
+        else:
+            self.meta = False
         run_cfg = judge_cfg.get('run_cfg', {})
         self.num_gpus = run_cfg.get('num_gpus', 0)
         self.num_procs = run_cfg.get('num_procs', 1)
         self.judge_cfg = copy.deepcopy(judge_cfg)
+        self.judge_models = judge_models
+        self.infer_order = cfg.get('infer_order')
         self.given_pred = cfg.eval.get('given_pred', [])

     def get_command(self, cfg_path, template):
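Editor's note: the rewritten __init__ above now reads its judges from top-level config keys instead of eval.runner.task.judge_cfg: judge_model names the per-sample judge, judge_models is the list kept on the task (and handed to _load_model_judgements later), and meta_judge_model, when present, replaces the judge and flips the task into meta-review mode (self.meta = True). A minimal, hypothetical sketch of those keys — the abbrs are placeholders and the full model configs are omitted; how a partitioner fans judge_models out into separate tasks is outside this hunk:

# Hypothetical top-level entries in an eval config; abbrs are placeholders.
judge_models = [
    dict(abbr='judge-a'),   # full model cfg (type, path, ...) omitted here
    dict(abbr='judge-b'),
]
# Optional: its presence switches SubjectiveEvalTask into meta-review mode.
meta_judge_model = dict(abbr='meta-judge')
infer_order = 'double'      # read back by the task via cfg.get('infer_order')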
@@ -78,17 +84,15 @@ class SubjectiveEvalTask(BaseTask):
                 # Load Dataset
                 eval_cfg = dataset_cfg.get('eval_cfg')
                 output_column = dataset_cfg['reader_cfg']['output_column']
-                if type(model_cfg) == ConfigDict:
-                    model_cfg = (model_cfg, )
-                model_cfg += ({
-                    'abbr':
-                    'judged-by--' + model_abbr_from_cfg(self.judge_cfg)
-                }, )
                 out_path = get_infer_output_path(
-                    model_cfg, dataset_cfg, osp.join(self.work_dir, 'results'))
+                    deal_with_judge_model_abbr(model_cfg, self.judge_cfg,
+                                               self.meta), dataset_cfg,
+                    osp.join(self.work_dir, 'results'))
                 if osp.exists(out_path):
                     continue
-                self._score(model_cfg, dataset_cfg, eval_cfg, output_column)
+                self._score(model_cfg, dataset_cfg, eval_cfg, output_column,
+                            self.meta)

     def _load_model_pred(
         self,
@@ -194,7 +198,139 @@ class SubjectiveEvalTask(BaseTask):
             'model_preds': pred_strs
         }

-    def _score(self, model_cfg, dataset_cfg, eval_cfg, output_column):
+    def _load_model_judgements(
+        self,
+        model_cfg: Union[ConfigDict, List[ConfigDict]],
+        dataset_cfg: ConfigDict,
+        eval_cfg: ConfigDict,
+        judge_cfg: Union[ConfigDict, List[ConfigDict]],
+    ) -> Union[None, List[str]]:
+
+        if isinstance(judge_cfg, (tuple, list)):
+            return [
+                self._load_model_judgements(model_cfg, dataset_cfg, eval_cfg,
+                                            j) for j in judge_cfg
+            ]
+
+        pred_strs = None
+        model_cfg = [model_cfg] if isinstance(model_cfg,
+                                              ConfigDict) else model_cfg
+        # There will be 5 situations, so we need to deal with them
+        # 1.There are no partitions in infer and judge stage
+        # 2.No partition in infer stage, but use partition in judge stage
+        # 3.Use partition in infer stage, but not use partition in judge stage
+        # 4.Use both partition, with same partition size
+        # 5.Use both partition, but different partition size

+        # If take SubjectSizePartition, get new filename without _0
+        if 'test_range' in dataset_cfg['reader_cfg']:
+            filename = get_infer_output_path(
+                deal_with_judge_model_abbr([m for m in model_cfg], judge_cfg),
+                dataset_cfg, osp.join(self.work_dir, 'results'))
+            root, ext = osp.splitext(filename)
+            last_underscore_index = root.rfind('_')
+            root = root[:last_underscore_index]
+            filename = root + ext
+        # If take SubjectNaivePartition, get filename
+        else:
+            filename = get_infer_output_path(
+                deal_with_judge_model_abbr([m for m in model_cfg], judge_cfg),
+                dataset_cfg, osp.join(self.work_dir, 'results'))
+        # Get partition name
+        root, ext = osp.splitext(filename)
+        partial_filename = root + '_0' + ext

+        # If no predictions get in predictions dir
+        if not osp.exists(osp.realpath(filename)) and not osp.exists(
+                osp.realpath(partial_filename)):
+            return {'error': 'No judgements found.'}
+        else:
+            # If use Naive partition in infer stage
+            if osp.exists(osp.realpath(filename)):
+                preds = mmengine.load(filename)
+                pred_strs = [
+                    preds[str(i)]['prediction'] for i in range(len(preds))
+                ]
+            # If use Size partition in infer stage
+            else:
+                filename = partial_filename
+                pred_strs = []
+                i = 1
+                while osp.exists(osp.realpath(filename)):
+                    preds = mmengine.load(filename)
+                    filename = root + f'_{i}' + ext
+                    i += 1
+                    pred_strs += [
+                        preds[str(i)]['prediction'] for i in range(len(preds))
+                    ]
+        # Get all judgements in pred_strs
+        # If take SubjectSizePartition, get new pred_strs based on test_range
+        if 'test_range' in dataset_cfg['reader_cfg']:
+            test_range = dataset_cfg['reader_cfg']['test_range']
+            if self.infer_order == 'double':
+                # When infer_order is set to double, we need to select the judgements to match the predictions, which will be doubled later
+                start = 0
+                end = None
+                pred_strs_length = len(pred_strs)
+                # Split the string by ':'; test_range is a string shaped like '[0:15]'
+                parts = test_range.strip('[]').split(':')
+                # Check if the start index is provided
+                if parts[0]:
+                    start = int(parts[0])
+                # Check if the end index is provided
+                if len(parts) > 1 and parts[1]:
+                    end = int(parts[1])
+                else:
+                    # If the end is not provided, determine the default end based on the length of 'pred_strs'
+                    end = int(pred_strs_length / 2)
+                    assert pred_strs_length % 2 == 0, "Since you have set the infer_order as 'double', the length of 'pred_strs' must be even."
+                assert end <= pred_strs_length / 2, "The 'end' value must not exceed half of the 'pred_strs' length."
+                # Reset the new start and end
+                start *= 2
+                end *= 2
+                pred_strs = eval('pred_strs[' + str(start) + ':' + str(end) +
+                                 ']')
+            else:
+                pred_strs = eval('pred_strs' + test_range)
+        # If take SubjectNaivePartition, get all pred_strs
+        else:
+            pred_strs = pred_strs
+        if ('pred_role' in eval_cfg and 'meta_template' in judge_cfg
+                and not MODELS.get(judge_cfg['type']).is_api
+                and isinstance(pred_strs[0], str)):
+            # Create a prompt template for role config parsing
+            from opencompass.models.base import LMTemplateParser
+            parser = LMTemplateParser(judge_cfg['meta_template'])
+            role = parser.roles[eval_cfg['pred_role']]
+            pred_strs = [
+                extract_role_pred(pred, role.get('begin', None),
+                                  role.get('end', None)) for pred in pred_strs
+            ]

+        # Postprocess predictions if necessary
+        ds_abbr = dataset_abbr_from_cfg(dataset_cfg)
+        model_postprocessors = judge_cfg.get('pred_postprocessor', {})
+        pred_postprocessor = None
+        for pattern in model_postprocessors.keys():
+            if fnmatch.fnmatch(ds_abbr, pattern):
+                pred_postprocessor = model_postprocessors[pattern]
+                break
+        if 'pred_postprocessor' in eval_cfg or pred_postprocessor:
+            kwargs = pred_postprocessor or eval_cfg['pred_postprocessor']
+            proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
+            pred_strs = [proc(s, **kwargs) for s in pred_strs]

+        return {
+            'model_name': model_abbr_from_cfg(judge_cfg),
+            'model_preds': pred_strs
+        }

+    def _score(self,
+               model_cfg,
+               dataset_cfg,
+               eval_cfg,
+               output_column,
+               meta=False):
         test_set = build_dataset_from_cfg(dataset_cfg).test
         # Postprocess dataset if necessary
         if 'dataset_postprocessor' in eval_cfg:
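Editor's note: _load_model_judgements mirrors the existing _load_model_pred, but walks the judged-by-- result files produced by each judge model. The only non-obvious step is the infer_order == 'double' branch: when every question is later answered in both A/B orders, a test_range of [start:end] over the single-order items has to be widened to [2*start:2*end] over the doubled list. A small standalone sketch of just that index arithmetic follows; the helper name is invented, and it returns a slice object instead of building a slicing expression for eval() as the diff does.

def widen_range_for_double(test_range: str, n_doubled: int) -> slice:
    # Map a '[start:end]' range over single-order items onto a list that
    # stores every item twice (both answer orders back to back).
    assert n_doubled % 2 == 0, 'doubled list must have even length'
    parts = test_range.strip('[]').split(':')
    start = int(parts[0]) if parts[0] else 0
    end = int(parts[1]) if len(parts) > 1 and parts[1] else n_doubled // 2
    assert end <= n_doubled // 2, 'end must not exceed half the doubled length'
    return slice(2 * start, 2 * end)


# Items 3..5 of the original set correspond to positions 6..10 in the doubled list.
assert widen_range_for_double('[3:5]', 20) == slice(6, 10)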
@@ -208,27 +344,32 @@ class SubjectiveEvalTask(BaseTask):

             test_set = test_set.map(postprocess)
         # Get out_path
-        out_path = get_infer_output_path(model_cfg, dataset_cfg,
-                                         osp.join(self.work_dir, 'results'))
-        new_model_cfg = []
-        for m_cfg in model_cfg:
-            if len(m_cfg) > 1:
-                new_model_cfg.append(m_cfg)
-        if len(new_model_cfg) == 1:
-            new_model_cfg = new_model_cfg[0]
-        model_preds = self._load_model_pred(new_model_cfg, dataset_cfg,
-                                            eval_cfg, self.given_pred)
+        out_path = get_infer_output_path(
+            deal_with_judge_model_abbr(model_cfg, self.judge_cfg, self.meta),
+            dataset_cfg, osp.join(self.work_dir, 'results'))
+        if meta:
+            model_preds = self._load_model_pred(model_cfg, dataset_cfg,
+                                                eval_cfg, self.given_pred)
+            model_judges = self._load_model_judgements(model_cfg, dataset_cfg,
+                                                       eval_cfg,
+                                                       self.judge_models)
+        else:
+            model_preds = self._load_model_pred(model_cfg, dataset_cfg,
+                                                eval_cfg, self.given_pred)
+            model_judges = None
         if not self.judge_cfg:
-            raise ValueError('missing "eval.runner.task.judge_cfg"')
+            raise ValueError('missing "eval.judge_cfg"')
         eval_cfg['evaluator']['judge_cfg'] = self.judge_cfg
         eval_cfg['evaluator']['dataset_cfg'] = dataset_cfg
         eval_cfg['evaluator']['output_path'] = out_path
         icl_evaluator = ICL_EVALUATORS.build(eval_cfg['evaluator'])
         references = (test_set[output_column] if output_column else None)

         if 'error' not in model_preds:
             result = icl_evaluator.score(predictions=model_preds,
-                                         references=references)
+                                         judgements=model_judges,
+                                         references=references,
+                                         meta=meta,
+                                         infer_order=self.infer_order)
         else:
             result = model_preds

@@ -259,17 +400,24 @@ class SubjectiveEvalTask(BaseTask):
         output_paths = []
         for model, datasets in zip(self.model_cfgs, self.dataset_cfgs):
             for dataset in datasets:
-                if type(model) == ConfigDict:
+                if isinstance(model, ConfigDict):
                     model = (model, )
-                model += ({
-                    'abbr':
-                    'judged-by--' + model_abbr_from_cfg(self.judge_cfg)
-                }, )
+                if self.meta:
+                    model += ({
+                        'abbr':
+                        'summarized-by--' + model_abbr_from_cfg(self.judge_cfg)
+                    }, )
+                else:
+                    model += ({
+                        'abbr':
+                        'judged-by--' + model_abbr_from_cfg(self.judge_cfg)
+                    }, )
                 output_paths.append(
                     get_infer_output_path(
                         model, dataset,
                         osp.join(self.work_dir, self.output_subdir),
                         file_extension))
+                model = model[:-1]
         return output_paths


@@ -46,3 +46,25 @@ def get_infer_output_path(model_cfg: ConfigDict,
     model_abbr = model_abbr_from_cfg(model_cfg)
     dataset_abbr = dataset_abbr_from_cfg(dataset_cfg)
     return osp.join(root_path, model_abbr, f'{dataset_abbr}.{file_extension}')
+
+
+def deal_with_judge_model_abbr(model_cfg, judge_model_cfg, meta=False):
+    if isinstance(model_cfg, ConfigDict):
+        model_cfg = (model_cfg, )
+    if meta:
+        for m_cfg in model_cfg:
+            if 'summarized-by--' in m_cfg['abbr']:
+                return model_cfg
+        model_cfg += ({
+            'abbr':
+            'summarized-by--' + model_abbr_from_cfg(judge_model_cfg)
+        }, )
+    else:
+        for m_cfg in model_cfg:
+            if 'judged-by--' in m_cfg['abbr']:
+                return model_cfg
+        model_cfg += ({
+            'abbr':
+            'judged-by--' + model_abbr_from_cfg(judge_model_cfg)
+        }, )
+    return model_cfg
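Editor's note: deal_with_judge_model_abbr appends a pseudo-model entry ('judged-by--<judge>' or, in meta-review mode, 'summarized-by--<judge>') to the model tuple, so that get_infer_output_path writes each judge's results under its own directory; the early return makes the tagging idempotent. A rough usage sketch under those assumptions — the abbrs are invented for illustration and the path comment describes intent, not an exact format:

from mmengine.config import ConfigDict

from opencompass.utils import deal_with_judge_model_abbr

model = ConfigDict(abbr='model-under-test')   # invented abbr
judge = ConfigDict(abbr='judge-model')        # invented abbr

tagged = deal_with_judge_model_abbr(model, judge)
# tagged is a tuple ending in {'abbr': 'judged-by--judge-model'}; calling the
# function on it again returns it unchanged because the marker is present.
meta_tagged = deal_with_judge_model_abbr(model, judge, meta=True)
# meta_tagged ends in {'abbr': 'summarized-by--judge-model'}, so meta-review
# outputs are kept apart from the per-judge outputs when paths are built.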
run.py
@@ -341,7 +341,14 @@ def main():
         if args.dry_run:
            return
         runner = RUNNERS.build(cfg.eval.runner)
-        runner(tasks)
+
+        # For meta-review-judge in subjective evaluation
+        if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
+                tasks[0], list):
+            for task_part in tasks:
+                runner(task_part)
+        else:
+            runner(tasks)

     # visualize
     if args.mode in ['all', 'eval', 'viz']: