[Feature] Added Bradley-Terry subjective evaluation

Alexander Lam 2024-12-31 11:01:23 +08:00 committed by GitHub
parent 98435dd98e
commit dc6035cfcb
13 changed files with 1230 additions and 45 deletions
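
Background: the Bradley-Terry model treats every pairwise judge verdict as the outcome of a match and estimates a latent strength theta_m for each model, with P(a beats b) = 1 / (1 + exp(theta_b - theta_a)). The sketch below is illustrative only (it is not the code added in this commit); it assumes scikit-learn and made-up model names, and shows the core idea behind the new CompassArenaBradleyTerrySummarizer: turn judge verdicts into win/loss rows and fit an intercept-free logistic model.

# Minimal illustrative sketch (NOT this commit's implementation): fit
# Bradley-Terry ratings from pairwise match records via logistic regression.
import numpy as np
from sklearn.linear_model import LogisticRegression

matches = [  # hypothetical records, shaped like the new postprocessors' output
    {'model_a': 'model-x', 'model_b': 'gpt4-turbo', 'winner': 'model_a'},
    {'model_a': 'model-y', 'model_b': 'gpt4-turbo', 'winner': 'model_b'},
    {'model_a': 'model-x', 'model_b': 'model-y', 'winner': 'model_a'},
]
models = sorted({m['model_a'] for m in matches} | {m['model_b'] for m in matches})
index = {name: i for i, name in enumerate(models)}

X = np.zeros((len(matches), len(models)))
y = np.zeros(len(matches))
for row, match in enumerate(matches):       # ties omitted for brevity
    X[row, index[match['model_a']]] = 1.0   # +1 for the "a" side
    X[row, index[match['model_b']]] = -1.0  # -1 for the "b" side
    y[row] = 1.0 if match['winner'] == 'model_a' else 0.0

clf = LogisticRegression(fit_intercept=False).fit(X, y)
ratings = 400.0 * clf.coef_[0] / np.log(10) + 1000.0  # Elo-like rescaling
print(dict(zip(models, ratings.round(1))))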

View File

@@ -0,0 +1,125 @@
from mmengine.config import read_base
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_bradleyterry_postprocess
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'alpaca_eval',
]
alpacav2_datasets = []
gpt4_prompt = """
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
## Instruction
{
"instruction": "{question}",
}
## Model Outputs
Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
{
{
"model_identifier": "m",
"output": "{prediction}"
},
{
"model_identifier": "M",
"output": "{prediction2}"
}
}
## Task
Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
## Best Model Identifier
"""
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
gpt4 = [
dict(
abbr='gpt4-turbo',
)
]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.',
)
],
round=[
dict(role='HUMAN', prompt=gpt4_prompt),
],
),
),
dict_postprocessor=dict(
type=alpacaeval_bradleyterry_postprocess,
),
keep_predictions=True, # Must be turned on to save predictions from model pairs to calculate style features in postprocessor
),
pred_role='BOT',
)
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred=[
{
'abbr': 'gpt4-turbo',
'path': './data/subjective/alpaca_eval/gpt4-turbo',
}
],
)
)
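
Usage note (illustrative): like other OpenCompass subjective configs, the dataset list above is meant to be pulled into a top-level eval config via read_base(); the example eval config added later in this commit does exactly that, using the import path below.

# Sketch of consuming the config above; the import path mirrors the example
# eval config that appears later in this commit.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_bradleyterry import (
        alpacav2_datasets,
    )

datasets = [*alpacav2_datasets]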

View File

@@ -0,0 +1,173 @@
from opencompass.datasets import (
CompassArenaDataset,
compassarena_bradleyterry_postprocess,
)
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'ref'],
output_column='judge',
)
data_path = 'data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答12平局
并提供你的解释原因
如果你认为回答1更好你的输出应形如
选择A
原因blahblah blahblah\n
如果你认为回答2更好你的输出应形如
选择B
原因blahblah blahblah\n
如果你认为回答12打成平手你的输出应形如
选择C
原因blahblah blahblah\n
"""
knowledge_prompt = (
"""
请根据提供的 评分要求用户问题参考答案 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下更好的回答能对知识点进行额外补充且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
"""
+ base_prompt
)
language_prompt = (
"""
请根据提供的 评分要求用户问题 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 在有明确的参考答案的情况下越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅更加符合与人类对话的习惯包括语气情调等
3. 在都准确答对问题的前提下更好的回答能进行额外补充且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
"""
+ base_prompt
)
math_prompt = (
"""
请根据提供的 评分要求用户问题参考答案 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
"""
+ base_prompt
)
reason_prompt = math_prompt
creation_prompt = (
"""
请根据提供的 评分要求用户问题 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 好的回答必须首先符合用户问题里的各种需求不能跑题
2. 好的回答必须具有逻辑连贯性围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
"""
+ base_prompt
)
sub_map = {
'language': language_prompt,
'knowledge': knowledge_prompt,
'reason_v2': reason_prompt,
'math_v2': math_prompt,
'creationv2_zh': creation_prompt,
}
gpt4 = [
dict(
abbr='gpt4-turbo',
)
]
for _name, _prompt in sub_map.items():
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=_prompt),
]
),
),
dict_postprocessor=dict(
type=compassarena_bradleyterry_postprocess,
count_ties=True,
),
keep_predictions=True, # Must be turned on to save predictions from model pairs to calculate style features in postprocessor
),
pred_role='BOT',
)
compassarena_datasets.append(
dict(
abbr=f'compassarena_{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='random',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred=[
{
'abbr': 'gpt4-turbo',
'path': './data/subjective/compass_arena/gpt4-turbo',
}
],
)
)
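
For reference, the judge is instructed to answer with 选择：A/B/C (choice A, B, or C). The compassarena_bradleyterry_postprocess added later in this commit maps that choice onto a Bradley-Terry outcome; a condensed sketch of the mapping:

# Condensed sketch of the winner mapping in compassarena_bradleyterry_postprocess.
def choice_to_winner(choice: str, count_ties: bool = True):
    choice = choice.strip()
    if choice == 'A':
        return 'model_a'  # 回答1 (answer 1) judged better
    if choice == 'B':
        return 'model_b'  # 回答2 (answer 2) judged better
    if choice == 'C' and count_ties:
        return 'tie'
    return None           # the record is skipped otherwise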

View File

@@ -0,0 +1,74 @@
from opencompass.datasets import WildBenchDataset, wildbench_bradleyterry_postprocess
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'prompt'],
output_column='judge',
)
data_path = './data/subjective/WildBench/wildbench.jsonl'
wildbench_datasets = []
subjective_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template="""{dialogue}"""),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=32768, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(type=PromptTemplate, template="""{prompt}"""),
dict_postprocessor=dict(type=wildbench_bradleyterry_postprocess),
keep_predictions=True, # Must be turned on to save predictions from model pairs to calculate style features in postprocessor
),
pred_role='BOT',
)
base_models = [
dict(
abbr='gpt4-turbo',
),
dict(
abbr='HaiKu',
),
dict(
abbr='llama-2-70b-chat-hf',
),
]
wildbench_datasets.append(
dict(
abbr='wildbench',
type=WildBenchDataset,
path=data_path,
eval_mode='pair',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
given_pred=[
{'abbr': 'gpt4-turbo', 'path': './data/subjective/WildBench/gpt4'},
{
'abbr': 'llama-2-70b-chat-hf',
'path': './data/subjective/WildBench/llama2-70b',
},
{'abbr': 'HaiKu', 'path': './data/subjective/WildBench/claude'},
{
'abbr': 'llama-2-70b-chat-turbomind',
'path': './data/subjective/WildBench/llama2-70b',
},
{
'abbr': 'llama-2-70b-chat-vllm',
'path': './data/subjective/WildBench/llama2-70b',
},
],
mode='m2n', # m models are pitted against n base models in pairwise battles
infer_order='random',
base_models=base_models,
)
)
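
WildBench judges grade each pair on the A++/A+/A=B/B+/B++ scale; for Bradley-Terry fitting these grades are collapsed to a win/tie/loss outcome, as in the wildbench_bradleyterry_postprocess added later in this commit (condensed below).

# Condensed from wildbench_bradleyterry_postprocess later in this commit.
GRADE_TO_WINNER = {
    'A++': 'model_a',  # strong preference for answer A
    'A+': 'model_a',   # slight preference for answer A
    'A=B': 'tie',
    'B+': 'model_b',   # slight preference for answer B
    'B++': 'model_b',  # strong preference for answer B
}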

View File

@@ -0,0 +1,126 @@
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_bradleyterry import (
alpacav2_datasets,
)
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_bradleyterry import (
compassarena_datasets,
)
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_bradleyterry import (
wildbench_datasets,
)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
models as lmdeploy_internlm2_5_7b_chat,
)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
models as lmdeploy_internlm2_5_20b_chat,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as lmdeploy_qwen2_5_14b_instruct,
)
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
models as lmdeploy_qwen2_7b_instruct,
)
from opencompass.models import (
HuggingFace,
HuggingFaceCausalLM,
HuggingFaceChatGLM3,
OpenAI,
TurboMindModelwithChatTemplate,
)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import (
CompassArenaBradleyTerrySummarizer,
SubjectiveSummarizer,
)
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we usually enable sampling (do_sample) for the models
models = [
*lmdeploy_internlm2_5_7b_chat,
*lmdeploy_internlm2_5_20b_chat,
*lmdeploy_qwen2_5_14b_instruct,
*lmdeploy_qwen2_5_7b_instruct,
*lmdeploy_qwen2_7b_instruct,
]
datasets = [
*alpacav2_datasets,
*compassarena_datasets,
*wildbench_datasets,
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='CompassJudger-1-32B-Instruct',
path='opencompass/CompassJudger-1-32B-Instruct',
engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=4),
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(
type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)
),
)
## ------------- Summary Configuration
# This step fits a Bradley-Terry (statistical) model, with the option to include
# style features and group-based control variables
# (the group variables must be available in the input dataset for each observation).
summarizer = dict(
type=CompassArenaBradleyTerrySummarizer,
rating_system='bradleyterry',
num_bootstrap=100,
num_cpu=None,
with_control_vars=True,
normalize_style_features=False,
odds_ratio=True,
)
work_dir = 'outputs/subjective/bradleyterry'
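
The num_bootstrap=100 setting controls how many bootstrap rounds are run, typically to estimate uncertainty in the fitted ratings. A conceptual sketch of the idea (this is not the actual compute_bootstrap_style_control implementation; fit_fn is a hypothetical callback that maps a list of match records to {model_name: rating}):

# Conceptual sketch only: resample the matches with replacement, refit each
# round, then take percentiles of the per-model ratings across rounds.
import numpy as np

def bootstrap_ratings(matches, fit_fn, num_round=100, seed=0):
    rng = np.random.default_rng(seed)
    rounds = []
    for _ in range(num_round):
        idx = rng.integers(0, len(matches), len(matches))
        rounds.append(fit_fn([matches[i] for i in idx]))
    return rounds  # e.g. take the 2.5%/97.5% percentiles per model for a 95% CI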

View File

@@ -2,10 +2,14 @@
from .alignbench import AlignmentBenchDataset # noqa: F401, F403
from .alignbench import alignbench_postprocess # noqa: F401, F403
from .alpacaeval import AlpacaEvalDataset # noqa: F401, F403
from .alpacaeval import alpacaeval_bradleyterry_postprocess # noqa: F401, F403
from .alpacaeval import alpacaeval_postprocess # noqa: F401, F403
from .arena_hard import ArenaHardDataset # noqa: F401, F403
from .arena_hard import arenahard_postprocess # noqa: F401, F403
from .compass_arena import CompassArenaDataset, compassarena_postprocess
from .compass_arena import CompassArenaDataset # noqa: F401, F403
from .compass_arena import \
compassarena_bradleyterry_postprocess # noqa: F401, F403
from .compass_arena import compassarena_postprocess # noqa: F401, F403
from .compass_arena_subjective_bench import *
from .compassbench import CompassBenchDataset # noqa: F401, F403
from .compassbench_checklist import \
@@ -27,4 +31,5 @@ from .mtbench101 import mtbench101_postprocess
from .multiround import MultiroundDataset # noqa: F401, F403
from .subjective_cmp import SubjectiveCmpDataset # noqa: F401, F403
from .wildbench import WildBenchDataset # noqa: F401, F403
from .wildbench import wildbench_bradleyterry_postprocess # noqa: F401, F403
from .wildbench import wildbench_postprocess # noqa: F401, F403

View File

@@ -5,6 +5,8 @@ from collections import defaultdict
from datasets import Dataset, DatasetDict
from opencompass.datasets.subjective.compass_arena_subjective_bench import \
get_element_counts
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path
@@ -33,7 +35,7 @@ class AlpacaEvalDataset(BaseDataset):
'judge': {
'capability': capability,
'question': question
}
},
})
dataset = Dataset.from_list(raw_data)
return dataset
@@ -64,33 +66,54 @@ def post_process_alpacav2(completion: str):
@DICT_POSTPROCESSORS.register_module('alpacaeval')
def alpacaeval_postprocess(output: dict, output_path: str) -> dict:
def alpacaeval_postprocess(
output: dict,
output_path: str,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
output, output_path, post_process_alpacav2)
result=output,
filename=output_path,
post_process=post_process_alpacav2,
)
if len(judged_answers) == 0:
scores = None
win_model1, win_model2, categories = defaultdict(float), defaultdict(
float), defaultdict(float)
model1, model2 = references[0]['answer1'], references[0]['answer2']
for prediction, reference in zip(judged_answers, references):
win_model1, win_model2, categories = (
defaultdict(float),
defaultdict(float),
defaultdict(float),
)
if 'base_models' in references[0]:
base_models = references[0]['base_models']
else:
# TODO: Assuming the first model in the first record is the base model.
# This might not be the case if infer_order == 'random'
base_models = [references[0]['answer1']]
if isinstance(base_models, str):
base_models = [base_models]
for judged_answer, reference in zip(judged_answers, references):
categories['total'] += 1
categories[reference['capability']] += 1
if prediction['rank'] == 1:
if reference['answer1'] == model1:
if judged_answer['rank'] == 1:
if reference['answer1'] in base_models:
win_model1[reference['capability']] += 1
win_model1['total'] += 1
else:
win_model2[reference['capability']] += 1
win_model2['total'] += 1
else:
if reference['answer1'] == model1:
if reference['answer1'] in base_models:
win_model2[reference['capability']] += 1
win_model2['total'] += 1
else:
win_model1[reference['capability']] += 1
win_model1['total'] += 1
for capability in categories:
if capability not in win_model1:
win_model1[capability] = 0.0
@@ -106,3 +129,78 @@ def alpacaeval_postprocess(output: dict, output_path: str) -> dict:
results = win_model2
results['details'] = output
return results
@DICT_POSTPROCESSORS.register_module('alpacaeval_bradleyterry')
def alpacaeval_bradleyterry_postprocess(
output: dict,
output_path: str,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
result=output,
filename=output_path,
post_process=post_process_alpacav2,
)
if 'prediction1' not in references[0]:
raise ValueError(
'prediction1 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
)
if 'prediction2' not in references[0]:
raise ValueError(
'prediction2 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
)
if 'base_models' in references[0]:
base_models = references[0]['base_models']
else:
# TODO: Assuming the first model in the first record is the base model.
# This might not be the case if infer_order == 'random'
base_models = [references[0]['answer1']]
if isinstance(base_models, str):
base_models = [base_models]
results = {}
matches = []
for judged_answer, reference in zip(judged_answers, references):
cur_dict = {}
if judged_answer['rank'] == 1:
if reference['answer1'] in base_models:
cur_dict['winner'] = 'model_a'
else:
cur_dict['winner'] = 'model_b'
elif judged_answer['rank'] == 2:
if reference['answer1'] in base_models:
cur_dict['winner'] = 'model_b'
else:
cur_dict['winner'] = 'model_a'
else:
cur_dict['winner'] = 'tie'
cur_dict['capability'] = reference['capability']
cur_dict['model_a'] = reference['answer1']
cur_dict['model_b'] = reference['answer2']
cur_dict['prediction1'] = reference['prediction1']
cur_dict['prediction2'] = reference['prediction2']
matches.append(cur_dict)
### ---------- Add Style Metadata ---------- ###
matches = get_element_counts(
data=matches,
column='prediction1',
suffix='_a',
)
matches = get_element_counts(
data=matches,
column='prediction2',
suffix='_b',
)
results['matches'] = matches
# results["details"] = output
return results
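
For clarity, each record appended to results['matches'] above has roughly the shape shown below (placeholder values); get_element_counts then appends style-feature counts computed from prediction1/prediction2 under the '_a'/'_b' suffixes (the exact feature names live in compass_arena_subjective_bench and are not shown in this diff).

# Illustrative match record (placeholder values) as assembled above, before
# get_element_counts appends the '_a'/'_b' style-feature columns.
example_match = {
    'winner': 'model_b',                  # 'model_a' | 'model_b' | 'tie'
    'capability': '<capability label>',   # reference['capability']
    'model_a': 'gpt4-turbo',              # reference['answer1']
    'model_b': '<candidate model abbr>',  # reference['answer2']
    'prediction1': '<full text of answer 1>',
    'prediction2': '<full text of answer 2>',
}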

View File

@@ -4,6 +4,8 @@ from collections import defaultdict
from datasets import Dataset
from opencompass.datasets.subjective.compass_arena_subjective_bench import \
get_element_counts
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from .subjective_cmp import SubjectiveCmpDataset
@@ -48,7 +50,8 @@ def check_position_bias(judged_answers, references, banned_choice=['C']):
}
else:
first_judge = position_bias_dict[question_hash]['judge']
if judge == first_judge and first_judge not in banned_choice and judge not in banned_choice:
if (judge == first_judge and first_judge not in banned_choice
and judge not in banned_choice):
# If second choice is same with first choice, there has position bias.
position_bias_flag += 1
return position_bias_flag
@@ -63,10 +66,12 @@ def post_process_compassarena(item):
@DICT_POSTPROCESSORS.register_module('compassarena')
def compassarena_postprocess(output: dict,
output_path: str,
summary_type='single',
check_pos_bias=True) -> dict:
def compassarena_postprocess(
output: dict,
output_path: str,
summary_type='single',
check_pos_bias=True,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
output, output_path, post_process_compassarena)
@@ -116,3 +121,65 @@ def compassarena_postprocess(output: dict,
results = win_model2
results['details'] = output
return results
@DICT_POSTPROCESSORS.register_module('compassarena_bradleyterry')
def compassarena_bradleyterry_postprocess(
output: dict,
output_path: str,
count_ties: bool = True,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
result=output,
filename=output_path,
post_process=post_process_compassarena,
)
if 'prediction1' not in references[0]:
raise ValueError(
'prediction1 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
)
if 'prediction2' not in references[0]:
raise ValueError(
'prediction2 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
)
results = {}
matches = []
for judged_answer, reference in zip(judged_answers, references):
cur_dict = {}
if judged_answer.strip() == 'A':
cur_dict['winner'] = 'model_a'
elif judged_answer.strip() == 'B':
cur_dict['winner'] = 'model_b'
elif judged_answer.strip() == 'C' and count_ties:
cur_dict['winner'] = 'tie'
else:
continue
cur_dict['capability'] = reference['capability']
cur_dict['model_a'] = reference['answer1']
cur_dict['model_b'] = reference['answer2']
cur_dict['prediction1'] = reference['prediction1']
cur_dict['prediction2'] = reference['prediction2']
matches.append(cur_dict)
### ---------- Add Style Metadata ---------- ###
matches = get_element_counts(
data=matches,
column='prediction1',
suffix='_a',
)
matches = get_element_counts(
data=matches,
column='prediction2',
suffix='_b',
)
results['matches'] = matches
# results["details"] = output
return results

View File

@@ -5,6 +5,8 @@ from collections import defaultdict
from datasets import Dataset, DatasetDict
from opencompass.datasets.subjective.compass_arena_subjective_bench import \
get_element_counts
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path
@@ -204,7 +206,7 @@ def parse_conversation(conversation):
last_query = conversation[-1]['content']
chat_round.append({
'role': role_dict[conversation[-1]['role']],
'content': conversation[-1]['content']
'content': conversation[-1]['content'],
})
chat_round.append({'role': 'assistant', 'content': ''})
@@ -249,7 +251,7 @@ class WildBenchDataset(BaseDataset):
'primary_tag': item['primary_tag'],
'secondary_tag': item['secondary_tag'],
'question_id': item['session_id'],
}
},
})
dataset = Dataset.from_list(raw_data)
return dataset
@@ -267,13 +269,13 @@ task_group_new = {
'Role playing': 'Creative Tasks',
'Advice seeking': 'Information/Advice seeking',
'Data Analysis': 'Math & Data Analysis',
'Others': 'Creative Tasks'
'Others': 'Creative Tasks',
}
def post_process_wildbench_pair(judgement: dict):
judgement = judgement['prediction']
pattern = r'\"choice\": \"(.*?)\"'
pattern = r"\"choice\": \"(.*?)\""
matched_result = re.findall(pattern, judgement)
if matched_result:
return matched_result[0]
@@ -283,7 +285,7 @@ def post_process_wildbench_pair(judgement: dict):
def post_process_wildbench_single(judgement: dict):
judgement = judgement['prediction']
pattern = r'\"score\": \"(.*?)\"'
pattern = r"\"score\": \"(.*?)\""
matched_result = re.findall(pattern, judgement)
try:
score = float(matched_result[0])
@@ -299,23 +301,36 @@ def post_process_wildbench_single(judgement: dict):
@DICT_POSTPROCESSORS.register_module('wildbench')
def wildbench_postprocess(output: dict, output_path: str) -> dict:
def wildbench_postprocess(
output: dict,
output_path: str,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
output, output_path, post_process_wildbench_pair)
result=output,
filename=output_path,
post_process=post_process_wildbench_pair,
)
if 'base_models' in references[0]:
base_models = references[0]['base_models']
else:
base_models = ['HaiKu', 'gpt4-turbo', 'llama-2-70b-chat-hf']
if isinstance(base_models, str):
base_models = [base_models]
win_base_model = defaultdict(float)
win_compare_model = defaultdict(float)
categories = defaultdict(float)
score_mapping = {'A++': 1, 'A+': 0.5, 'A=B': 0, 'B+': -0.5, 'B++': -1}
for prediction, reference in zip(judged_answers, references):
if prediction not in score_mapping:
for judged_answer, reference in zip(judged_answers, references):
if judged_answer not in score_mapping:
continue
flag = 1 if reference['answer1'] in [
'HaiKu', 'gpt4-turbo', 'llama-2-70b-chat-hf'
] else -1
score_1 = score_mapping[prediction] * flag
flag = 1 if reference['answer1'] in base_models else -1
score_1 = score_mapping[judged_answer] * flag
score_2 = -score_1
tags = [reference['primary_tag']] + reference['secondary_tag']
@@ -325,11 +340,11 @@ def wildbench_postprocess(output: dict, output_path: str) -> dict:
categories[task_group_new[tag]] += 1
for capability in categories:
win_base_model[capability] = win_base_model[capability] / categories[
capability] * 100
win_base_model[capability] = (win_base_model[capability] /
categories[capability] * 100)
win_base_model[capability] = round(win_base_model[capability], 2)
win_compare_model[capability] = win_compare_model[
capability] / categories[capability] * 100
win_compare_model[capability] = (win_compare_model[capability] /
categories[capability] * 100)
win_compare_model[capability] = round(win_compare_model[capability], 2)
# Calculating the mean of the values
@@ -341,3 +356,82 @@ def wildbench_postprocess(output: dict, output_path: str) -> dict:
results = win_compare_model
results['details'] = output
return results
@DICT_POSTPROCESSORS.register_module('wildbench_bradleyterry')
def wildbench_bradleyterry_postprocess(
output: dict,
output_path: str,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
result=output,
filename=output_path,
post_process=post_process_wildbench_pair,
)
if 'prediction1' not in references[0]:
raise ValueError(
'prediction1 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
)
if 'prediction2' not in references[0]:
raise ValueError(
'prediction2 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
)
score_mapping = {
'A++': 'model_a',
'A+': 'model_a',
'A=B': 'tie',
'B+': 'model_b',
'B++': 'model_b',
}
results = {}
matches = []
for judged_answer, reference in zip(judged_answers, references):
cur_dict = {}
if judged_answer in score_mapping:
cur_dict['winner'] = score_mapping[judged_answer]
else:
# cur_dict["winner"] = (
# "tie" # Count match as tie if judge answer cannot be parsed.
# )
# Skip if judge answer cannot be parsed
print('Judge answer cannot be parsed. Skipping record...')
continue
cur_dict['primary_tag'] = reference['primary_tag']
# Extract the first tag from the list and use it as the categorical level.
# This can serve as a categorical variable in the Bradley-Terry model.
cur_dict['secondary_tag'] = (reference['secondary_tag'][0]
if len(reference['secondary_tag']) > 0
else 'Others')
# Keep original secondary tag list for reference
cur_dict['secondary_tags'] = reference['secondary_tag']
cur_dict['model_a'] = reference['answer1']
cur_dict['model_b'] = reference['answer2']
cur_dict['prediction1'] = reference['prediction1']
cur_dict['prediction2'] = reference['prediction2']
matches.append(cur_dict)
### ---------- Add Style Metadata ---------- ###
matches = get_element_counts(
data=matches,
column='prediction1',
suffix='_a',
)
matches = get_element_counts(
data=matches,
column='prediction2',
suffix='_b',
)
results['matches'] = matches
# results["details"] = output
return results
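
A quick illustration of the parsing done by post_process_wildbench_pair above: the regex pulls the "choice" value out of the judge's JSON-like reply (the reply text here is made up).

# Illustrative only: extract the "choice" field from a judge reply.
import re

judge_reply = '{"analysis": "...", "choice": "A+"}'  # made-up judge output
choice = re.findall(r'\"choice\": \"(.*?)\"', judge_reply)[0]
print(choice)  # -> A+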

View File

@@ -2,7 +2,7 @@
import os.path as osp
import random
import re
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Union
import mmengine
from datasets import Dataset
@@ -32,6 +32,7 @@ def order_preds_and_record_references(
infer_order: List,
seed: int = 666,
keep_preds: bool = False,
base_model_abbrs: List[str] = None,
):
"""Order predictions based on args and recording regrading references.
@@ -41,6 +42,7 @@ infer_order (str, optional): The mode of inference order.
infer_order (str, optional): The mode of inference order.
seed (int, optional): Random seed.
keep_preds (bool, optional): Whether to save model predictions in references. These will be available as input to the postprocessor. Defaults to False.
base_model_abbrs (List[str], optional): List of base models passed from dataset cfg.
"""
random.seed(seed)
list_of_preds = [[] for _ in range(len(predictions))]
@@ -56,6 +58,12 @@ if keep_preds:
if keep_preds:
references[i][f'prediction{j+1}'] = preds[j][0]
if base_model_abbrs is not None:
if isinstance(base_model_abbrs, str):
base_model_abbrs = [base_model_abbrs]
references[i]['base_models'] = base_model_abbrs
if infer_order == 'double':
assert len(predictions) == 2
list_of_preds = [
@@ -77,6 +85,7 @@ reversed_references.append(reversed_item)
reversed_references.append(reversed_item)
references += reversed_references
return list_of_preds, references
@@ -164,11 +173,24 @@ class LMEvaluator:
{} for _ in range(len(predictions[0]['model_preds']))
]
base_model_abbrs = None
if self.dataset_cfg is not None:
if 'base_models' in self.dataset_cfg:
base_models = self.dataset_cfg['base_models']
if isinstance(base_models, Dict):
base_models = [base_models]
base_model_abbrs = [
base_mdl['abbr'] for base_mdl in base_models
]
predictions, references = order_preds_and_record_references(
predictions=predictions,
references=references,
infer_order=infer_order,
keep_preds=self.keep_predictions,
base_model_abbrs=base_model_abbrs,
)
# calculate the number of duplicated predictions
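
Roughly, with keep_preds=True and base_model_abbrs supplied, each reference record leaving order_preds_and_record_references carries the fields the new Bradley-Terry postprocessors check for (placeholder values below).

# Illustrative reference record (placeholder values) after ordering, assuming
# keep_preds=True and base_model_abbrs=['gpt4-turbo'].
example_reference = {
    'answer1': 'gpt4-turbo',              # model behind prediction1 after ordering
    'answer2': '<candidate model abbr>',  # model behind prediction2
    'prediction1': '<text of answer 1>',  # kept because keep_preds=True
    'prediction2': '<text of answer 2>',
    'base_models': ['gpt4-turbo'],        # injected from the dataset cfg abbrs
    # ...plus the dataset's original judge fields (e.g. 'capability', 'question')
}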

View File

@@ -757,16 +757,17 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
# if rating_system == "bradleyterry":
if with_control_vars:
bootstrap_df, bootstrap_coef = compute_bootstrap_style_control(
elo_rating_final, coef_final = compute_style_control(
df=matches_df,
num_round=num_bootstrap,
baseline_model=base_model,
normalize_style_features=self.normalize_style_features,
control_variables=groups,
odds_ratio=self.odds_ratio,
)
elo_rating_final, coef_final = compute_style_control(
bootstrap_df, bootstrap_coef = compute_bootstrap_style_control(
df=matches_df,
num_round=num_bootstrap,
baseline_model=base_model,
normalize_style_features=self.normalize_style_features,
control_variables=groups,
@@ -953,7 +954,9 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
time_str (str, optional): Timestamp for file suffix. Defaults to
datetime.now().strftime('%Y%m%d_%H%M%S').
"""
all_scores_df_list = []
all_scores = {}
all_scores_ctrl_coefs = {}
for judge_model in self.judge_models:
control_coefficients = {}
leaderboard_tables = {}
@@ -986,7 +989,7 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
print('-' * 10 +
f"{dataset_abbr + ':' + base_model_abbr}\n" +
'-' * 10)
# print(cur_table_df)
print(cur_table_df)
print(cur_ctrl_coefs)
leaderboard_tables = self.flip_dict_levels(leaderboard_tables)
@@ -1002,17 +1005,43 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
)
# Fit another BT model using the first base_model, combining matches from all datasets
all_scores_df, all_scores_ctrl_coefs = self._calculate_ratings(
matches=all_matches,
base_model=list(base_models)[0],
groups=self.groups,
)
cur_judge_all_scores_df, cur_judge_all_scores_ctrl_coefs = (
self._calculate_ratings(
matches=all_matches,
base_model=list(base_models)[0],
groups=self.groups,
))
cur_judge_all_scores_df['judge'] = judge_abbr
all_scores_df_list.append(cur_judge_all_scores_df)
all_scores[judge_abbr] = pd.Series(
all_scores_df['rating'],
index=all_scores_df['model_name'],
cur_judge_all_scores_df['rating'],
index=cur_judge_all_scores_df['model_name'],
).to_dict()
all_scores_ctrl_coefs[judge_abbr] = cur_judge_all_scores_ctrl_coefs
all_scores_df = pd.concat(all_scores_df_list)
output_path_all_scores_df = osp.join(
self.work_dir, 'summary', f'summary_{time_str}_all_scores_df.csv')
output_path_all_scores = osp.join(
self.work_dir, 'summary', f'summary_{time_str}_all_scores.json')
output_path_all_scores_ctrl_coefs = osp.join(
self.work_dir, 'summary',
f'summary_{time_str}_all_scores_ctrl_coefs.json')
all_scores_df.to_csv(output_path_all_scores_df)
with open(output_path_all_scores, 'w', encoding='utf-8') as f:
json.dump(all_scores, f, ensure_ascii=False, indent=4)
with open(output_path_all_scores_ctrl_coefs, 'w',
encoding='utf-8') as f:
json.dump(all_scores_ctrl_coefs, f, ensure_ascii=False, indent=4)
print(f'{all_scores_df=}')
print(f'{all_scores=}')
print(f'{all_scores_ctrl_coefs=}')
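
Once a run finishes, the summary artifacts written above can be inspected directly; a hedged sketch (the directory layout and timestamp are placeholders for the actual run's work_dir and time_str):

# Illustrative only: load the files the summarizer writes above.
import json
import pandas as pd

summary_dir = 'outputs/subjective/bradleyterry/<run>/summary'  # placeholder path
all_scores_df = pd.read_csv(f'{summary_dir}/summary_<time_str>_all_scores_df.csv')
with open(f'{summary_dir}/summary_<time_str>_all_scores.json', encoding='utf-8') as f:
    all_scores = json.load(f)       # {judge_abbr: {model_name: rating}}
with open(f'{summary_dir}/summary_<time_str>_all_scores_ctrl_coefs.json',
          encoding='utf-8') as f:
    ctrl_coefs = json.load(f)       # {judge_abbr: control-variable coefficients}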