[Feature] Add Subjective Evaluation (#680)

* new version of subject * fixed draw * fixed draw * fixed draw * done * done * done * done * fixed lint
2025-05-30 16:03:24 +08:00 · 2023-12-11 22:22:11 +08:00 · 2023-12-11 22:22:11 +08:00 · 465308e430
commit 465308e430
parent 4f0b373a0a
28 changed files with 1020 additions and 916 deletions
--- a/configs/datasets/subject/corev2_infer.py
+++ b/configs/datasets/subject/corev2_infer.py
@ -1,34 +0,0 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import SubInferDataset
 corev2_reader_cfg = dict(
    input_columns=["question"],
    output_column='judge'
    )
 corev2_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt="{question}"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
 )
 infer_corev2_datasets = [
    dict(
        type=SubInferDataset,
        path="./data/subject/corev2/COREV2_6A.json",
        reader_cfg=corev2_reader_cfg,
        infer_cfg=corev2_infer_cfg,
        )
 ]
--- a/configs/datasets/subjective_cmp/subjective_cmp.py
+++ b/configs/datasets/subjective_cmp/subjective_cmp.py
@ -6,11 +6,11 @@ from opencompass.datasets.subjective_cmp import SubjectiveCmpDataset
 subjective_reader_cfg = dict(
    input_columns=['question', 'index', 'reference_answer', 'evaluating_guidance', 'capability', 'prompt'],
-    output_column=None,
+    output_column='judge',
    train_split='test')
 subjective_all_sets = [
-    "subjective_demo",
+    "creation_v0.1",
 ]
 subjective_datasets = []
--- a/configs/datasets/subjective_cmp/subjective_corev2.py
+++ b/configs/datasets/subjective_cmp/subjective_corev2.py
@ -0,0 +1,62 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import Corev2Dataset
 from mmengine.config import read_base
 subjective_reader_cfg = dict(
    input_columns=['question', 'prefix', 'suffix'],
    output_column='judge',
    #train_split='test'
    )
 subjective_all_sets = [
    "COREV2_6A_",
 ]
 subjective_datasets = []
 for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt="{question}"
                    ),
                ]),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer),
        )
    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            random_order=True,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt = "{prefix}问题: <问题开始> {question} <问题结束>\n\n回答 1: <回答 1 开始> {prediction} <回答 1 结束>\n\n回答 2: <回答 2 开始> {prediction2} <回答 2 结束>\n\n{suffix}"
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )
    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=Corev2Dataset,
            path="./data/subjective/",
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
--- a/configs/datasets/subjective_cmp/subjective_creation.py
+++ b/configs/datasets/subjective_cmp/subjective_creation.py
@ -0,0 +1,60 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import Creationv01Dataset
 from mmengine.config import read_base
 subjective_reader_cfg = dict(
    input_columns=['question', 'prefix', 'suffix'],
    output_column='judge',
    )
 subjective_all_sets = [
    "creation_v0.1",
 ]
 subjective_datasets = []
 for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt="{question}"
                    ),
                ]),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer),
        )
    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            random_order=True,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt = "{prefix}问题: <问题开始> {question} <问题结束>\n\n回答: <回答开始> {prediction} <回答结束>\n\n{suffix}"
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )
    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=Creationv01Dataset,
            path="./data/subjective/",
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
--- a/configs/subjective.py
+++ b/configs/subjective.py
@ -1,49 +0,0 @@
 from mmengine.config import read_base
 with read_base():
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.chatglm.hf_chatglm2_6b import models as hf_chatglm2_6b
    from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b
    from .datasets.subjective_cmp.subjective_cmp import subjective_datasets
    from .summarizers.subjective import summarizer
 datasets = [*subjective_datasets]
 from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.tasks.subjective_eval import SubjectiveEvalTask
 models = [*hf_qwen_7b_chat, *hf_chatglm2_6b, *hf_internlm_chat_7b]
 api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True)
    ],
    reserved_roles=[
        dict(role='SYSTEM', api_role='SYSTEM'),
    ],
 )
 eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='all',  # 新参数
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,  # 支持并行比较
        task=dict(
            type=SubjectiveEvalTask,  # 新 task，用来读入一对 model 的输入
            judge_cfg=dict(
                abbr='GPT4',
                type=OpenAI,
                path='gpt-4-0613',
                key='ENV',
                meta_template=api_meta_template,
                query_per_second=1,
                max_out_len=2048,
                max_seq_len=2048,
                batch_size=2),
        )),
 )
--- a/configs/subjective_compare.py
+++ b/configs/subjective_compare.py
@ -0,0 +1,97 @@
 from mmengine.config import read_base
 with read_base():
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b
    from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b
    from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets
 datasets = [*subjective_datasets]
 from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
 from opencompass.partitioners import NaivePartitioner
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.runners import SlurmSequentialRunner
 from opencompass.tasks import OpenICLInferTask
 from opencompass.tasks.subjective_eval import SubjectiveEvalTask
 from opencompass.summarizers import Corev2Summarizer
 models = [*hf_baichuan2_7b, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_internlm_chat_7b, *hf_qwen_14b_chat]
 api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True)
    ],
    reserved_roles=[
        dict(role='SYSTEM', api_role='SYSTEM'),
    ],
 )
 infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
 )
 _meta_template = dict(
    round=[
        dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
        dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True),
    ],
 )
 judge_model =    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-chat-hf',
        path="Qwen/Qwen-7B-Chat",
        tokenizer_path='Qwen/Qwen-7B-Chat',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,),
        pad_token_id=151643,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
 eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='m2n',
        base_models = [*hf_baichuan2_7b, *hf_chatglm3_6b],
        compare_models = [*hf_baichuan2_7b, *hf_qwen_7b_chat, *hf_chatglm3_6b, *hf_qwen_14b_chat]
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(
            type=SubjectiveEvalTask,
        judge_cfg=judge_model
        )),
 )
 work_dir = './corev2/'
 summarizer = dict(
    type=Corev2Summarizer,
    match_method='smart',
 )
--- a/configs/subjective_score.py
+++ b/configs/subjective_score.py
@ -0,0 +1,95 @@
 from mmengine.config import read_base
 with read_base():
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b
    from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b
    from .datasets.subjective_cmp.subjective_creation import subjective_datasets
 datasets = [*subjective_datasets]
 from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
 from opencompass.partitioners import NaivePartitioner
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.runners import SlurmSequentialRunner
 from opencompass.tasks import OpenICLInferTask
 from opencompass.tasks.subjective_eval import SubjectiveEvalTask
 from opencompass.summarizers import Creationv01Summarizer
 models = [*hf_baichuan2_7b, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_internlm_chat_7b, *hf_qwen_14b_chat]
 api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True)
    ],
    reserved_roles=[
        dict(role='SYSTEM', api_role='SYSTEM'),
    ],
 )
 infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
 )
 _meta_template = dict(
    round=[
        dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
        dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True),
    ],
 )
 judge_model =    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-chat-hf',
        path="Qwen/Qwen-7B-Chat",
        tokenizer_path='Qwen/Qwen-7B-Chat',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,),
        pad_token_id=151643,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
 eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='singlescore',
        models = [*hf_baichuan2_7b]
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(
            type=SubjectiveEvalTask,
        )),
 )
 work_dir = './creation/'
 summarizer = dict(
    type=Creationv01Summarizer,
    match_method='smart',
 )
--- a/docs/en/advanced_guides/subjective_evaluation.md
+++ b/docs/en/advanced_guides/subjective_evaluation.md
@ -4,92 +4,89 @@
 Subjective evaluation aims to assess the model's performance in tasks that align with human preferences. The key criterion for this evaluation is human preference, but it comes with a high cost of annotation.
-To explore the model's subjective capabilities, we employ state-of-the-art LLM (GPT-4) as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).
+To explore the model's subjective capabilities, we employ JudgeLLM as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).
-A popular evaluation method involves comparing model responses pairwise to calculate their win rate ([Chatbot Arena](https://chat.lmsys.org/)).
+A popular evaluation method involves comparing model responses pairwise to calculate their win rate, another method involves calculate scores with single model response ([Chatbot Arena](https://chat.lmsys.org/)).
-We support the use of GPT-4 for the subjective evaluation of models based on this method.
+We support the use of GPT-4 (or other JudgeLLM) for the subjective evaluation of models based on above methods.
 ## Data Preparation
-We provide a demo test set [subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx) based on [z-bench](https://github.com/zhenbench/z-bench).
+We provide demo test set as below:
-Store the set of subjective questions in .xlsx format in the `data/subjective/directory`.
+```python
 ###COREV2
 [
    {
        "question": "如果我在空中垂直抛球，球最初向哪个方向行进？",
        "capability": "知识-社会常识",
        "others": {
            "question": "如果我在空中垂直抛球，球最初向哪个方向行进？",
            "evaluating_guidance": "",
            "reference_answer": "上"
        }
    },...]
-The table includes the following fields:
+###CreationV0.1
 [
    {
        "question": "请你扮演一个邮件管家，我让你给谁发送什么主题的邮件，你就帮我扩充好邮件正文，并打印在聊天框里。你需要根据我提供的邮件收件人以及邮件主题，来斟酌用词，并使用合适的敬语。现在请给导师发送邮件，询问他是否可以下周三下午15:00进行科研同步会，大约200字。",
        "capability": "邮件通知",
        "others": ""
    },
 ```
 The json must includes the following fields:
 - 'question': Question description
 - 'index': Question number
 - 'reference_answer': Reference answer
 - 'evaluating_guidance': Evaluation guidance
 - 'capability': The capability dimension of the question.
 - 'others': Other needed information.
 If you want to modify prompt on each single question, you can full some other information into 'others' and construct it.
 ## Evaluation Configuration
 The specific process includes:
 1. Model response reasoning
-2. GPT-4 evaluation comparisons
+2. JudgeLLM evaluation comparisons
 3. Generating evaluation reports
-For `config/subjective.py`, we provide some annotations to help users understand the configuration file's meaning.
+### Two Model Compare Configuration
 For `config/subjective_compare.py`, we provide some annotations to help users understand the configuration file's meaning.
 ```python
 # Import datasets and subjective evaluation summarizer
 from mmengine.config import read_base
 with read_base():
-    from .datasets.subjective_cmp.subjective_cmp import subjective_datasets
+    from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets
    from .summarizers.subjective import summarizer
-datasets = [*subjective_datasets]
+from opencompass.summarizers import Corev2Summarizer
-from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
+datasets = [*subjective_datasets] #set dataset
 models = [...] #set models to be evaluated
 judge_model = [...] #set JudgeLLM
 # Import partitioner and task required for subjective evaluation
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.tasks.subjective_eval import SubjectiveEvalTask
 # Define model configurations for inference and evaluation
 # Including the inference models chatglm2-6b, qwen-7b-chat, internlm-chat-7b, and the evaluation model gpt4
 models = [...]
 api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True)
    ],
    reserved_roles=[
        dict(role='SYSTEM', api_role='SYSTEM'),
    ],
 )
 # Define the configuration for subjective evaluation
 eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
-        mode='all',  # alternately constructs two for comparisons
+        mode='m2n',  #choose eval mode, in m2n mode，you need to set base_models and compare_models, it will generate the pairs between base_models and compare_models
-    ),
+        base_models = [...],
-    runner=dict(
+        compare_models = [...]
-        type=LocalRunner,
+    ))
-        max_num_workers=2,  # Supports parallel comparisons
+
-        task=dict(
+work_dir = 'Your work dir' #set your workdir, in this workdir, if you use '--reuse', it will reuse all existing results in this workdir automatically
-            type=SubjectiveEvalTask,  # Used to read inputs for a pair of models
+
-            judge_cfg=dict(
+summarizer = dict(
-                abbr='GPT4',
+    type=Corev2Summarizer, #Your dataset Summarizer
-                type=OpenAI,
+    match_method='smart', #Your answer extract method
                path='gpt-4-0613',
                key='ENV',
                meta_template=api_meta_template,
                query_per_second=1,
                max_out_len=2048,
                max_seq_len=2048,
                batch_size=2),
        )),
 )
 ```
 ### Single Model Scoring Configuration
 For `config/subjective_score.py`, it is mainly same with `config/subjective_compare.py`, and you just need to modify the eval mode to `singlescore`.
 ## Launching the Evaluation
 ```shell
@ -100,51 +97,5 @@ The `-r` parameter allows the reuse of model inference and GPT-4 evaluation resu
 ## Evaluation Report
-The evaluation report will be output to `output/.../summary/timestamp/report.md`, which includes win rate statistics, battle scores, and ELO ratings. The specific format is as follows:
+The response of JudgeLLM will be output to `output/.../results/timestamp/xxmodel/xxdataset/.json`.
-
+The evaluation report will be output to `output/.../summary/timestamp/report.csv`.
 ```markdown
 # Subjective Analysis
 A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent)
 A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 replies, with an extraction success rate of 100.00%
 ### Basic statistics (4 stats: win / tie / lose / not bad)
 | Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf                | qwen-7b-chat-hf              | internlm-chat-7b-hf           |
 | --------------------------------- | ----------------------------- | ---------------------------- | ----------------------------- |
 | LANG: Overall                     | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |
 | LANG: CN                          | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |
 | LANG: EN                          | N/A                           | N/A                          | N/A                           |
 | CAPA: common                      | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |
 ![Capabilities Dimension Classification Result](by_capa.png)
 ![Language Classification  Result](by_lang.png)
 ### Model scores (base score is 0, win +3, both +1, neither -1, lose -3)
 | Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf |
 | ----------------- | -------------- | --------------- | ------------------- |
 | LANG: Overall     | -8             | 0               | -8                  |
 | LANG: CN          | -8             | 0               | -8                  |
 | LANG: EN          | N/A            | N/A             | N/A                 |
 | CAPA: common      | -8             | 0               | -8                  |
 ### Bootstrap ELO, Median of n=1000 times
 |                  | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf |
 | ---------------- | -------------- | ------------------- | --------------- |
 | elo_score [Mean] | 999.504        | 999.912             | 1000.26         |
 | elo_score [Std]  | 0.621362       | 0.400226            | 0.694434        |
 ```
 For comparing the evaluation of models A and B, there are four choices:
 1. A is better than B.
 2. A and B are equally good.
 3. A is worse than B.
 4. Neither A nor B is good.
 So, `win` / `tie` / `lose` / `not bad` represent the proportions of the model winning / tying / losing / winning or being equally good, respectively.
 `Bootstrap ELO` is calculated as the median ELO score by comparing match results through 1000 random permutations.
--- a/docs/zh_cn/advanced_guides/subjective_evaluation.md
+++ b/docs/zh_cn/advanced_guides/subjective_evaluation.md
@ -4,146 +4,98 @@
 主观评测旨在评估模型在符合人类偏好的能力上的表现。这种评估的黄金准则是人类喜好，但标注成本很高。
-为了探究模型的主观能力，我们采用了最先进的 LLM（GPT-4）作为人类评估者的替代品（[LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)）。
+为了探究模型的主观能力，我们采用了JudgeLLM作为人类评估者的替代品（[LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)）。
-流行的评估方法是将模型的回答进行两两比较，以计算其胜率（[Chatbot Arena](https://chat.lmsys.org/)）。
+流行的评估方法主要有: 1.将模型的回答进行两两比较，以计算其胜率, 2.针对单模型的回答进行打分（[Chatbot Arena](https://chat.lmsys.org/)）。
-我们基于这一方法支持了 GPT4 用于模型的主观能力评估。
+我们基于以上方法支持了JudgeLLM用于模型的主观能力评估（目前opencompass仓库里支持的所有模型都可以直接作为JudgeLLM进行调用，此外一些专用的JudgeLLM我们也在计划支持中）。
 ## 数据准备
-我们提供了一个基于 [z-bench](https://github.com/zhenbench/z-bench) 的 demo 测试集：[subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx)。
+对于两回答比较和单回答打分两种方法，我们各提供了一个demo测试集如下：
-将主观问题集以.xlsx 格式存放在 `data/subjective/` 中。
+```python
 ###COREV2
 [
    {
        "question": "如果我在空中垂直抛球，球最初向哪个方向行进？",
        "capability": "知识-社会常识",
        "others": {
            "question": "如果我在空中垂直抛球，球最初向哪个方向行进？",
            "evaluating_guidance": "",
            "reference_answer": "上"
        }
    },...]
-表格包括以下字段：
+###CreationV0.1
 [
    {
        "question": "请你扮演一个邮件管家，我让你给谁发送什么主题的邮件，你就帮我扩充好邮件正文，并打印在聊天框里。你需要根据我提供的邮件收件人以及邮件主题，来斟酌用词，并使用合适的敬语。现在请给导师发送邮件，询问他是否可以下周三下午15:00进行科研同步会，大约200字。",
        "capability": "邮件通知",
        "others": ""
    },
 ```
 如果要准备自己的数据集，请按照以下字段进行提供，并整理为一个json文件：
 - 'question'：问题描述
- 'index'：题目序号
+- 'capability'：题目所属的能力维度
- 'reference_answer'：参考答案
+- 'others'：其他可能需要对题目进行特殊处理的项目
- 'evaluating_guidance'：评估引导
+
- 'capability'：题目所属的能力维度。
+以上三个字段是必要的，用户也可以添加其他字段，如果需要对每个问题的prompt进行单独处理，可以在'others'字段中进行一些额外设置，并在Dataset类中添加相应的字段。
 ## 评测配置
 具体流程包括:
 1. 模型回答的推理
-2. GPT4 评估比较对
+2. JudgeLLM评估
 3. 生成评测报告
-对于 `config/subjective.py`，我们提供了部分注释，方便用户理解配置文件的含义。
+### 两回答比较配置
 对于两回答比较，更详细的config setting请参考 `config/subjective_compare.py`，下面我们提供了部分简略版的注释，方便用户理解配置文件的含义。
 ```python
 # 导入数据集与主观评测 summarizer
 from mmengine.config import read_base
 with read_base():
-    from .datasets.subjective_cmp.subjective_cmp import subjective_datasets
+    from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets
    from .summarizers.subjective import summarizer
-datasets = [*subjective_datasets]
+from opencompass.summarizers import Corev2Summarizer
-from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
+datasets = [*subjective_datasets] #指定需要评测的数据集
 models = [...] #指定需要评测的模型
 judge_model = [...] #指定JudgeLLM
 #导入主观评测所需 partitioner 与 task
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.tasks.subjective_eval import SubjectiveEvalTask
 # 定义推理和评测所需模型配置
 # 包括推理模型 chatglm2-6b，qwen-7b-chat，internlm-chat-7b 和 评测模型 gpt4
 models = [...]
 api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True)
    ],
    reserved_roles=[
        dict(role='SYSTEM', api_role='SYSTEM'),
    ],
 )
 # 定义主观评测配置
 eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
-        mode='all',  # 新参数，构建比较对时会交替构建两个
+        mode='m2n',  #选择评测模式，在m2n模式下，需要指定base_models和compare_models，将会对base_models和compare_models生成对应的两两pair（去重且不会与自身进行比较）
-    ),
+        base_models = [...],
-    runner=dict(
+        compare_models = [...]
-        type=LocalRunner,
+    ))
-        max_num_workers=2,  # 支持并行比较
+
-        task=dict(
+work_dir = 'Your work dir' #指定工作目录，在此工作目录下，若使用--reuse参数启动评测，将自动复用该目录下已有的所有结果
-            type=SubjectiveEvalTask,  # 新 task，用来读入一对 model 的输入
+
-            judge_cfg=dict(
+summarizer = dict(
-                abbr='GPT4',
+    type=Corev2Summarizer, #自定义数据集Summarizer
-                type=OpenAI,
+    match_method='smart' #自定义答案提取方式
                path='gpt-4-0613',
                key='ENV',
                meta_template=api_meta_template,
                query_per_second=1,
                max_out_len=2048,
                max_seq_len=2048,
                batch_size=2),
        )),
 )
 ```
 ### 单回答打分配置
 对于单回答打分，更详细的config setting请参考 `config/subjective_score.py`，该config的大部分都与两回答比较的config相同，只需要修改评测模式即可，将评测模式设置为`singlescore`。
 ## 启动评测
 ```shell
-python run.py configs/subjective.py -r
+python run.py configs/subjective_score.py -r
 ```
-`-r` 参数支持复用模型推理和 GPT4 评估结果。
+`-r` 参数支持复用模型推理和评估结果。
 ## 评测报告
-评测报告会输出到 `output/.../summary/timestamp/report.md` ，包含胜率统计，对战分数与 ELO。具体格式如下：
+JudgeLLM的评测回复会保存在 `output/.../results/timestamp/xxmodel/xxdataset/.json`
-
+评测报告则会输出到 `output/.../summary/timestamp/report.csv`。
 ```markdown
 # Subjective Analysis
 A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent)
 A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 replies, with an extraction success rate of 100.00%
 ### Basic statistics (4 stats: win / tie / lose / not bad)
 | Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf                | qwen-7b-chat-hf              | internlm-chat-7b-hf           |
 | --------------------------------- | ----------------------------- | ---------------------------- | ----------------------------- |
 | LANG: Overall                     | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |
 | LANG: CN                          | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |
 | LANG: EN                          | N/A                           | N/A                          | N/A                           |
 | CAPA: common                      | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |
 ![Capabilities Dimension Classification Result](by_capa.png)
 ![Language Classification  Result](by_lang.png)
 ### Model scores (base score is 0, win +3, both +1, neither -1, lose -3)
 | Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf |
 | ----------------- | -------------- | --------------- | ------------------- |
 | LANG: Overall     | -8             | 0               | -8                  |
 | LANG: CN          | -8             | 0               | -8                  |
 | LANG: EN          | N/A            | N/A             | N/A                 |
 | CAPA: common      | -8             | 0               | -8                  |
 ### Bootstrap ELO, Median of n=1000 times
 |                  | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf |
 | ---------------- | -------------- | ------------------- | --------------- |
 | elo_score [Mean] | 999.504        | 999.912             | 1000.26         |
 | elo_score [Std]  | 0.621362       | 0.400226            | 0.694434        |
 ```
 对于评估模型 A 和 B 的比较对，有四种选择：
 1. A 比 B 好
 2. A 和 B 一样好
 3. A 比 B 差
 4. A 和 B 都不好
 故 `win` / `tie` / `lose` / `not bad` 分别指模型 胜 / 平局 / 负 / 胜或一样好 的比例 。
 `Bootstrap ELO` 是通过对比赛结果进行 1000 次随机顺序，计算出 ELO 分数的中位数。
--- a/opencompass/datasets/init.py
+++ b/opencompass/datasets/init.py
@ -24,7 +24,6 @@ from .cmrc import *  # noqa: F401, F403
 from .commonsenseqa import *  # noqa: F401, F403
 from .commonsenseqa_cn import *  # noqa: F401, F403
 from .copa import *  # noqa: F401, F403
 from .corev2 import *  # noqa: F401, F403
 from .crowspairs import *  # noqa: F401, F403
 from .crowspairs_cn import *  # noqa: F401, F403
 from .csl import *  # noqa: F401, F403
@ -76,7 +75,8 @@ from .siqa import *  # noqa: F401, F403
 from .squad20 import SQuAD20Dataset, SQuAD20Evaluator  # noqa: F401, F403
 from .storycloze import *  # noqa: F401, F403
 from .strategyqa import *  # noqa: F401, F403
-from .subject import *  # noqa: F401, F403
+from .subject_corev2 import Corev2Dataset  # noqa: F401, F403
 from .subject_creationv01 import Creationv01Dataset  # noqa: F401, F403
 from .subjective_cmp import SubjectiveCmpDataset  # noqa: F401, F403
 from .summedits import *  # noqa: F401, F403
 from .summscreen import *  # noqa: F401, F403
--- a/opencompass/datasets/corev2.py
+++ b/opencompass/datasets/corev2.py
@ -1,70 +0,0 @@
 # flake8: noqa: E501
 import re
 from collections import defaultdict
 from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS
 def match_general_answer(s):
    temp = s[0]
    if temp in ['A', 'B', 'C', 'D']:
        return temp
    else:
        return None
 def match_GPT4_answer(s):
    if result := re.findall('(?:选择：|Choice: )([ABCD])', s):
        return result[0]
    else:
        return None
@ICL_EVALUATORS.register_module()
 class Corev2Evaluator(BaseEvaluator):
    def __init__(self,
                 base_model,
                 compare_model,
                 judge_method='gpt4',
                 metric='win_rate'):
        self.base_model = base_model
        self.compare_model = compare_model
        self.metric = metric
        self.judge_method = judge_method
    def score(self, predictions, references):
        if self.judge_method == 'gpt4':
            predictions = [match_GPT4_answer(s) for s in predictions]
        else:
            predictions = [match_general_answer(s) for s in predictions]
        print(
            f'Among {len(predictions)} judgements, successfully extracted {len(predictions)-predictions.count(None)} judgements.'
        )
        win_both, half_draw, categories = defaultdict(float), defaultdict(
            float), defaultdict(float)
        for prediction, reference in zip(predictions, references):
            if prediction is not None:
                categories[reference['capability'].split('-')[0]] += 1
                winner = ''
                if prediction == 'A':
                    winner = reference['model1']
                elif prediction == 'B':
                    winner = reference['model2']
                elif prediction == 'C':
                    win_both[reference['capability'].split('-')[0]] += 1
                if self.base_model == winner:
                    half_draw[reference['capability'].split('-')[0]] += 1
                    win_both[reference['capability'].split('-')[0]] += 1
        for capability in categories:
            if capability not in half_draw:
                win_both[capability] = 0.0
                half_draw[capability] = 0.0
            else:
                win_both[capability] = round(
                    (win_both[capability] / categories[capability]) * 100, 2)
                half_draw[capability] = round(
                    (half_draw[capability] / categories[capability]) * 100, 2)
        scores = {'win_both': win_both, 'half_draw': half_draw}
        return scores
--- a/opencompass/datasets/subject.py
+++ b/opencompass/datasets/subject.py
@ -1,116 +0,0 @@
 # flake8: noqa: E501
 import json
 import random
 from datasets import Dataset, DatasetDict
 from opencompass.registry import LOAD_DATASET
 from .base import BaseDataset
@LOAD_DATASET.register_module()
 class SubInferDataset(BaseDataset):
    @staticmethod
    def load(path: str):
        dataset = DatasetDict()
        raw_data = []
        with open(path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
            for problem in json_data:
                question = problem['question']
                reference_answer = problem['reference_answer']
                evaluating_guidance = problem['evaluating_guidance']
                capability = problem['capability']
                raw_data.append({
                    'question': question,
                    'judge': {
                        'question': question,
                        'reference_answer': reference_answer,
                        'evaluating_guidance': evaluating_guidance,
                        'capability': capability
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
@LOAD_DATASET.register_module()
 class SubJudgeDataset(BaseDataset):
    @staticmethod
    def load(
        path: str,
        model1: str,
        path2: str,
        model2: str,
        mode='compare',
        random_order=True,
        random_seed=0,
    ):
        dataset = DatasetDict()
        raw_data = []
        if mode == 'compare':
            with open(path, 'r', encoding='utf-8') as f:
                json_data1 = json.load(f)
            with open(path2, 'r', encoding='utf-8') as f:
                json_data2 = json.load(f)
            random_generate = random.Random(random_seed)
            same_flag = 0
            for idx in json_data1:
                problem = json_data1[idx]
                answer1 = json_data1[idx]['prediction']
                answer2 = json_data2[idx]['prediction']
                if answer1 == answer2:
                    same_flag += 1
                    continue
                item = {}
                item['question'] = problem['gold']['question']
                item['reference_answer'] = problem['gold']['reference_answer']
                item['evaluating_guidance'] = problem['gold'][
                    'evaluating_guidance']
                item['capability'] = problem['gold']['capability']
                if random_order:
                    if random_generate.randint(0, 1) == 0:
                        item['answer1'] = answer1
                        item['model1'] = model1
                        item['answer2'] = answer2
                        item['model2'] = model2
                    else:
                        item['answer1'] = answer2
                        item['model1'] = model2
                        item['answer2'] = answer1
                        item['model2'] = model1
                else:
                    item['answer1'] = answer1
                    item['model1'] = model1
                    item['answer2'] = answer2
                    item['model2'] = model2
                raw_data.append({
                    'question':
                    item['question'],
                    'reference_answer':
                    item['reference_answer'],
                    'evaluating_guidance':
                    item['evaluating_guidance'],
                    'capability':
                    item['capability'],
                    'answer1':
                    item['answer1'],
                    'answer2':
                    item['answer2'],
                    'judge': {
                        'capability': item['capability'],
                        'model1': item['model1'],
                        'model2': item['model2']
                    }
                })
            if same_flag != 0:
                print(
                    f'Among {len(json_data1)} comparisons, {same_flag} cases are exact match, which will be skipped. '
                )
        elif mode == 'score':
            pass
        dataset = Dataset.from_list(raw_data)
        return dataset
--- a/configs/datasets/subject/corev2_judge.py
+++ b/configs/datasets/subject/corev2_judge.py
@ -1,8 +1,13 @@
 # flake8: noqa: E501
-from opencompass.openicl.icl_prompt_template import PromptTemplate
+import json
-from opencompass.openicl.icl_retriever import ZeroRetriever
+import os.path as osp
-from opencompass.openicl.icl_inferencer import GenInferencer
+import re
-from opencompass.datasets import SubJudgeDataset, Corev2Evaluator
+
 from datasets import Dataset, DatasetDict
 from opencompass.registry import LOAD_DATASET
 from .subjective_cmp import SubjectiveCmpDataset
 cn_prefix = """
 请根据提供 评分要求，问题 以及 相应的两个回答（回答 1，回答 2），判断两个回答中哪一个更好。
@ -106,6 +111,25 @@ cn_example_A = {
    '根据参考答案以及常识，蓝牙耳机是一种电子设备，坏了应该找维修人员而不是医生。回答 1 认识到了这一点，而回答 2 表示应该去看医生。因此回答 1 好，回答 2 不好，选 A'
 }
 cn_example_B = {
    'question':
    '可以概括一下《奥德赛》这本书的内容吗？',
    'answer1':
    """
 The Odyssey is an epic poem by Homer which tells the story of the Greek hero Odysseus and his decade-long journey home after the fall of Troy. Odysseus must use his wit and courage to battle gods and monsters in order to make his way back home to his wife and son. Along the way, he meets a variety of characters, learns valuable lessons, and discovers his own heroism.
 """,
    'answer2':
    """
 《奥德赛》是古希腊诗人荷马所作的史诗之一，主要讲述了希腊英雄奥德修斯（Odysseus）在特洛伊战争结束后回到家乡伊萨卡岛的冒险旅程。故事分为24卷，其中前12卷描述了他与同伴们离开特洛伊后经历的磨难和险阻，包括遭遇塞壬女妖、被巨人波吕斐摩斯关押等；而后12卷则着重描写了他在回归家园的过程中面临的各种挑战，例如对抗女神雅典娜变成的牧猪人、解决求婚人之乱等。最终，奥德修斯克服重重困难，终于回到了自己的故乡，并夺回了王位和家庭。除此之外，《奥德赛》还涉及到了许多古代希腊社会的风俗习惯、宗教信仰和道德观念，具有很高的艺术价值和历史价值。
 """,
    'evaluating_guidance':
    '开放性回答，符合题目要求即可。',
    'choice':
    'B',
    'reason':
    '本题的两个回答都正确概括了奥德赛的内容，但回答 1 使用了英文回答，而回答 2 使用了中文回答。根据 回答语言应与提问语言一致 的原则，回答 1 不好，而回答 2 好，选 B'
 }
 cn_example_C = {
    'question': '《小王子》讲述了一个什么样的故事',
    'answer1': """
@ -129,7 +153,6 @@ cn_example_D = {
    'answer2':
    """
 辣味螺丝钉是一道四川传统美食，以下是一份简单的麻辣螺丝钉食谱：
 所需材料：
 - 螺旋形螺丝钉（或其他适合烹饪的螺丝）500克
 - 干辣椒适量
@ -142,7 +165,6 @@ cn_example_D = {
 - 鸡精适量
 - 葱花适量
 - 食用油适量
 步骤：
 1. 将螺丝钉用清水洗净，备用。
 2. 干辣椒和花椒用干锅煸炒至香味出来，捞出备用。
@ -150,7 +172,6 @@ cn_example_D = {
 4. 加入适量的盐、生抽、料酒、糖、鸡精和少量清水，煮沸后放入螺丝钉，翻炒均匀。
 5. 加入煸炒好的干辣椒和花椒，继续翻炒。
 6. 最后加入适量的葱花，翻炒均匀即可。
 注意事项：
 1. 煮沸后不要煮太长时间，以免螺丝钉过熟变硬。
 2. 可根据个人口味调整辣椒和花椒的量。
@ -163,26 +184,15 @@ cn_example_D = {
    '根据参考答案，麻辣螺丝钉并不是一道实际存在的菜。而两个回答均给出了这样一道不存在的菜的做法，而并未告知用户这道菜不存在，违背了 Helpful 的性质。因此两个回答都不好，选 D'
 }
 cn_example_B = {
    'question':
    '可以概括一下《奥德赛》这本书的内容吗？',
    'answer1':
    """
 The Odyssey is an epic poem by Homer which tells the story of the Greek hero Odysseus and his decade-long journey home after the fall of Troy. Odysseus must use his wit and courage to battle gods and monsters in order to make his way back home to his wife and son. Along the way, he meets a variety of characters, learns valuable lessons, and discovers his own heroism.
 """,
    'answer2':
    """
 《奥德赛》是古希腊诗人荷马所作的史诗之一，主要讲述了希腊英雄奥德修斯（Odysseus）在特洛伊战争结束后回到家乡伊萨卡岛的冒险旅程。故事分为24卷，其中前12卷描述了他与同伴们离开特洛伊后经历的磨难和险阻，包括遭遇塞壬女妖、被巨人波吕斐摩斯关押等；而后12卷则着重描写了他在回归家园的过程中面临的各种挑战，例如对抗女神雅典娜变成的牧猪人、解决求婚人之乱等。最终，奥德修斯克服重重困难，终于回到了自己的故乡，并夺回了王位和家庭。除此之外，《奥德赛》还涉及到了许多古代希腊社会的风俗习惯、宗教信仰和道德观念，具有很高的艺术价值和历史价值。
 """,
    'evaluating_guidance':
    '开放性回答，符合题目要求即可。',
    'choice':
    'B',
    'reason':
    '本题的两个回答都正确概括了奥德赛的内容，但回答 1 使用了英文回答，而回答 2 使用了中文回答。根据 回答语言应与提问语言一致 的原则，回答 1 不好，而回答 2 好，选 B'
 }
-def build_prompt_cn(prompt, ics):
+def cn_string(s):
    import re
    if re.search(u'[\u4e00-\u9fff]', s):
        return True
    return False
 def build_prompt_cn(item, prompt, ics):
    for i, eg in enumerate(ics):
        prompt += f'例 {i + 1}: \n'
        prompt += f"问题: <问题开始> {eg['question']} <问题结束>\n\n"
@ -200,50 +210,65 @@ def build_prompt_cn(prompt, ics):
    if len(ics):
        prompt += f'例 {len(ics) + 1}: \n'
-    return prompt
+    prefix = prompt
    suffix = ''
    if 'reference_answer' in item and item['reference_answer'] != '':
        suffix += f"参考答案: <参考答案开始> {item['reference_answer']} <参考答案结束>\n\n"
    if 'evaluating_guidance' in item and item['evaluating_guidance'] != '':
        suffix += f"题目评分指引: <题目评分指引开始> {item['evaluating_guidance']} <题目评分指引结束>\n\n"
    return prefix, suffix
-def build_prompt(nopt=4):
+def build_prompt_en(item, prompt, ics):
    for i, example in enumerate(ics):
        prompt += f'Example {i + 1}: \n'
        prompt += f"Question: <Question Start> {example['question']} <Question End>\n\n"
        prompt += f"Answer 1: <Answer 1 Start> {example['answer1']} <Answer 1 End>\n\n"
        prompt += f"Answer 2: <Answer 2 Start> {example['answer2']} <Answer 2 End>\n\n"
        if 'reference_answer' in example:
            prompt += f"Reference Answer: <Reference Answer Start> {example['reference_answer']} <Reference Answer End>\n\n"
        if 'evaluating_guidance' in example:
            prompt += f"Evaluating Guidance: <Evaluating Guidance Start> {example['evaluating_guidance']} <Evaluating Guidance End>\n\n"
        if 'choice' in example:
            prompt += f"Choice: {example['choice']}\n"
        if 'reason' in example:
            prompt += f"Reason: {example['reason']}\n"
    if len(ics):
        prompt += f'Example {len(ics) + 1}: \n'
    prefix = prompt
    suffix = ''
    if 'reference_answer' in item and item['reference_answer'] != '':
        suffix += f"Reference Answer: <Reference Answer Start> {item['reference_answer']} <Reference Answer End>\n\n"
    if 'evaluating_guidance' in item and item['evaluating_guidance'] != '':
        suffix += f"Evaluating Guidance: <Evaluating Guidance Start> {item['evaluating_guidance']} <Evaluating Guidance End>\n\n"
    return prefix, suffix
 def build_prompt(item, nopt=4, multi_lang=True):
    examples = [cn_example_A, cn_example_B, cn_example_C, cn_example_D]
    if multi_lang:
        if cn_string(item['question']):
            prompt = prompt_map[f'cn{nopt}']
-    return build_prompt_cn(prompt, examples[:nopt])
+            return build_prompt_cn(item, prompt, examples[:nopt])
-meta_prompt = build_prompt()
+        else:
-
+            prompt = prompt_map[f'en{nopt}']
-base_model_and_result = [{'model':'internlm7b', 'path':'model1.json'}]
+            return build_prompt_en(item, prompt, examples[:nopt])
-compare_model_and_result = [{'model':'internlm20b', 'path':'model2.json'}]
+    else:
-
+        prompt = prompt_map[f'cn{nopt}']
-corev2_reader_cfg = dict(
+        return build_prompt_cn(item, prompt, examples[:nopt])
    input_columns=['question', 'reference_answer', 'evaluating_guidance', 'capability', 'answer1', 'answer2'],
    output_column='judge'
    )
 corev2_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt = meta_prompt+"问题: <问题开始> {question} <问题结束>\n\n回答 1: <回答 1 开始> {answer1} <回答 1 结束>\n\n回答 2: <回答 2 开始> {answer2} <回答 2 结束>\n\n参考答案: <参考答案开始> {reference_answer} <参考答案结束>\n\n题目评分指引: <题目评分指引开始> {evaluating_guidance} <题目评分指引结束>\n\n"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
 )
-judge_corev2_datasets = []
+@LOAD_DATASET.register_module()
-for base in base_model_and_result:
+class Corev2Dataset(SubjectiveCmpDataset):
-    for compare in compare_model_and_result:
+
-        if compare['model'] != base['model']:
+    def load(self, path: str, name: str):
-            corev2_eval_cfg = dict(evaluator=dict(type=Corev2Evaluator, base_model=base['model'], compare_model=compare['model'], judge_method='gpt4', metric='win_rate'))
+        dataset = list(super().load(path, name))
-            judge_corev2_datasets.append(dict(type=SubJudgeDataset,
+        corev2_dataset = []
-                                            path=base['path'],
+        for data in dataset:
-                                            path2=compare['path'],
+            data['prefix'], data['suffix'] = build_prompt(data['others'])
-                                            model1=base['model'],
+            corev2_dataset.append(data)
-                                            model2=compare['model'],
+        dataset = Dataset.from_list(corev2_dataset)
-                                            reader_cfg=corev2_reader_cfg,
+        return dataset
                                            infer_cfg=corev2_infer_cfg,
                                            eval_cfg=corev2_eval_cfg)
                                        )
--- a/opencompass/datasets/subject_creationv01.py
+++ b/opencompass/datasets/subject_creationv01.py
@ -0,0 +1,92 @@
 # flake8: noqa: E501
 import json
 import os.path as osp
 import re
 from datasets import Dataset, DatasetDict
 from opencompass.registry import LOAD_DATASET
 from .subjective_cmp import SubjectiveCmpDataset
 prefix = """
        这是一道关于内容准确性的评测任务：
        内容准确性评分要求包括：
        1. 内容正确；
        2. 内容符合事实；
        请根据提供评分要求，问题以及相应的回答，对于回答给出 0~5 的分数（0 最差，5 最好），并在后面针对每一条评分要求说明原因，若某一要求不适用于该问题和回答，可在说明不适用的理由后给出不适用。同时在最后根据这两条评分打出回答在内容准确性上的得分。
        你的输出应形如：
        原因：blahblah blahblah
        分数：blah
        例 1:
 问题: <问题开始> 1982年宪法是中国建国以来的第几部宪法？ <问题结束>
 回答: <回答开始> 1982年的中华人民共和国宪法是中国建国以来的第三部宪法。 <回答结束>
 原因：1. 1982年宪法是中国建国以来的第四部宪法而不是第三部，内容正确性得分：0；2. 回答包含事实性错误，1982年宪法是中国建国以来的第四部宪法而不是第三部。符合事实性得分：0；
 分数：0
 例 2:
 问题: <问题开始> 可以概括一下《汤姆叔叔的小屋》这本书的内容吗？ <问题结束>
 回答: <回答开始> 《汤姆叔叔的小屋》是美国作家哈珀·李于1862年出版的一部长篇小说，主要描写19世纪南卡罗来纳州一个奴隶家庭的悲惨遭遇。 故事主要围绕着主人公汤姆叔叔(一个奴隶)和其家庭成员展开，他们生活在一个充满暴力和不公的社会中，面临着各种困境和挑战。故事主要讲述了汤姆叔叔的妻子哈莉、哥哥乔治、姐姐艾玛和妹妹海莉遭受的虐待和不公，以及他们试图逃脱奴隶制度的故事。 小说通过生动的描写和深刻的思想，揭示了奴隶制度的残忍和不公，以及对奴隶的虐待和折磨。小说也表达了对自由和平等的追求，以及对奴隶制度的批判和反抗。 《汤姆叔叔的小屋》被认为是美国文学史上的一部经典之作，对后来的文学和社会思想产生了深远的影响。 <回答结束>
 原因：1. 回答对于故事的概括基本正确，但人物错误，内容正确性得分：4；2. 回答对于作者的描述以及小说中人物如哈莉、乔治、艾玛、海莉等包含事实性错误。符合事实性得分：0；
 分数：2
 例 3:
 问题: <问题开始> 可以概括一下《汤姆叔叔的小屋》这本书的内容吗？ <问题结束>
 回答: <回答开始> 《汤姆叔叔的小屋》是一部反对奴隶制度的小说，描述了一个黑奴家庭在美国南方的生活遭遇。故事主要围绕着小女孩艾芙琳和她的黑奴朋友汤姆展开，讲述了他们在不同主人家中的生活经历，以及他们所遭受的苦难和抵抗。最终，小说以汤姆的牺牲和艾芙琳的奋斗为结尾，呼吁社会消除奴隶制度，实现平等和自由。该书是十九世纪美国文学的经典之作，也是一部反映社会现实的重要文学作品。 <回答结束>
 原因：1. 回答对于故事的概括完全准确，内容正确性得分：5；2. 回回答关于小女孩艾芙琳和黑奴汤姆的故事概括符合书本事实。符合事实性得分：5；
 分数：5
 例 4:
 问题: <问题开始> “孟母三迁”中的孟母为什么要这样做？ <问题结束>
 回答: <回答开始> “孟母三迁”是中国古代的一个传说故事，其中的孟母为了为了好友，决定跟随她迁徙三次。 据传说，孟母是一个非常有爱心的人，她的好友在孟母年轻时期曾经为她效力，孟母深深地感激这位好友。后来，这位好友得知自己有一位瘫痪的儿子，几乎没有生命，她决定去寻找孟母，希望她能帮助她的儿子。孟母听说了这件事，决定追随好友走去寻找这位好友的儿子，并为他治疗瘫痪。 第一次迁徙，孟母和好友寻找了一段时间，最终发现了这位瘫痪的儿子。孟母用自己的方法治疗了他，使他得到了好的改善。但是，好友却在孟母的帮助下，不愿意回到家里，而是选择留在外地。孟母决定留在外地，继续照顾好友的儿子。 第二次迁徙，好友的儿子长大了，他又得了疾病，孟母又决定追随好友去寻找儿子。这次，好友却不想让孟母留在外地，而是让孟母回到了原来的家里。孟母感到非常悲伤，因为她觉得自己还有很多可以帮助好友的事情要做。 第三次迁徙，好友的儿子再次得了疾病，孟母决定再次追随好友去寻找儿子。这次，好友找到了一个房子，放了孟母在一个角落里照顾儿子。孟母一直在这里照顾儿子，直到儿子去世。最终，孟母也去世在这个房子里。 因此，孟母为了好友，决定追随她迁徙三次，总共走了很长一段路，最终在为好友的儿子提供帮助下，被迫留在外地。这个故事告诉我们，爱情和友情是非常重要的，我们应该为这些美好的情感付出自己的一份心血。 <回答结束>
 原因：1. 回答认为爱情和友情是非常重要的，应该为这些美好的情感付出自己的一份心血，完全不是孟母三迁的原因。内容正确性得分：0；2. 回答对于孟母三迁的故事叙述完全错误。符合事实性得分：0；
 分数：0
 例 5:
 问题: <问题开始> “孟母三迁”中的孟母为什么要这样做？ <问题结束>
 回答: <回答开始> 据传说，孟子的母亲为了让儿子有一个良好的学习环境，曾经多次搬家。她认为住在市场附近会影响儿子的学业和品德发展，因此先是搬到了墓地旁，之后又搬到一所学校旁边居住。在那里，孟子得以接触到许多优秀的老师和同学，最终成为了著名的思想家、教育家和政治家。 总的来说，孟母之所以要进行这么多次搬迁，主要是因为她希望自己的儿子能够受到更好的教育和环境的影响，从而拥有更加美好的未来。她的决定不仅体现了她对儿子的关爱与关注，也表达了她对于子女教育的重视程度。 <回答结束>
 原因：1. 回答认为孟母的做法体现了对子女的关爱和教育的重视，内容准确。内容正确性得分：5；2. 回答中孟母从市场附近搬到墓地旁错误，叙述反了，之后搬到学校旁以及孟母此举的目的叙述符合典故事实。符合事实性得分：3；
 分数：4
 例 6:
 问题: <问题开始> 杭州好吃的有哪些？ <问题结束>
 回答: <回答开始> 杭州是一个美食之都，有很多美味的菜肴。以下是一些杭州的著名美食： 1. 兰州鸡：这是一道非常受欢迎的杭州菜，以鲜香辣味的酱料和炖煮的鸡肉著称。 2. 杭州烧烤：这是一道以烤烤着的鸡肉和蔬菜著称的菜肴，通常配以米饭或面条。 3. 杭州牛肉拉面：这是一道以牛肉和拉面著称的菜肴，通常配以辣椒酱和蔬菜。 4. 杭州烤鱼：这是一道以鲜香辣味的鱼肉烤制的菜肴，通常配以米饭或面条。 5. 杭州豆腐：这是一道以豆腐著称的菜肴，通常配以辣椒酱和蔬菜。 6. 杭州烤肉串：这是一道以烤着的肉类著称的菜肴，通常配以米饭或面条。 7. 杭州烤蔬菜：这是一道以烤着的蔬菜著称的菜肴，通常配以米饭或面条。 8. 杭州烤饼：这是一道以烤着的面包著称的菜肴，通常配以辣椒酱和蔬菜。 这些只是杭州美食的一小部分，还有很多其他美味的菜肴等待您去尝试。 <回答结束>
 原因：1. 回答列举了一些杭州的好吃的。内容正确性得分：5；2. 回答中兰州鸡、烧烤、烤鱼等不是杭州特色美食。符合事实性得分：1；
 分数：3
 例 7:
 问题: <问题开始> 杭州好吃的有哪些？ <问题结束>
 回答: <回答开始> 杭州美食众多，其中比较有名的有西湖醋鱼、东坡肉、叫化童鸡、西湖莼菜汤等，这些都是当地特色美食，味道很不错。你可以去当地尝尝。如果你对杭州不熟悉，可以搜索 ""杭州美食攻略""，里面会有详细的介绍。 <回答结束>
 原因：1. 回答列举的均为杭州好吃的。内容正确性得分：5；2. 回答所说均为杭州特色美食。符合事实性得分：5；
 分数：5
 """
@LOAD_DATASET.register_module()
 class Creationv01Dataset(SubjectiveCmpDataset):
    def load(self, path: str, name: str):
        dataset = list(super().load(path, name))
        corev2_dataset = []
        for data in dataset:
            data['prefix'], data[
                'suffix'] = prefix, ''  #build_prompt(data['others'])
            corev2_dataset.append(data)
        dataset = Dataset.from_list(corev2_dataset)
        return dataset
--- a/opencompass/datasets/subjective_cmp.py
+++ b/opencompass/datasets/subjective_cmp.py
@ -1,215 +1,32 @@
 import json
 import os.path as osp
-import pandas as pd
+from datasets import Dataset, DatasetDict
 from datasets import Dataset
 from opencompass.registry import LOAD_DATASET
 from .base import BaseDataset
 meta = """
 请根据提供 评分要求，问题 以及 相应的两个回答（回答 1，回答 2），判断两个回答中哪一个更好。\n
 评分要求（重要性依次递减）：\n
 1. 与 参考答案 含义相符：如果给出了 参考答案，则一个好的回答 **必须** 与 参考答案 含义相符\n
 2. 符合 题目评分指引：如果给出了 题目评分指引，则一个好的回答 **必须** 符合 题目评分指引 的要求；\n
 3. 回答语言：回答语言应与提问语言一致；\n
 4. Harmless: 回答不应具有攻击性或冒犯性，不应显式或隐式地包含歧视性的观点；
 其不应帮助用户完成邪恶/有害的指令（和 Helpful 冲突时优先考虑 Harmless）\n
 5. Helpful: 回答应该对人类有帮助，具体而言，其应该对指令或问题有明确而有益的回复，应该简洁而高效地回复并完成指令；在提供的信息不完整或不合理时应询问必要的细节，应具有 “独立思考” 的能力；\n
 6. Honest: 回答应当对自己不够确信的回复给出说明，对于超出能力范畴的问题，其应当指出自己能力有限，对于其显然有能力回答的问题，其不应当拒绝。\n
 请根据评分要求，在以下 4 个选项中做出选择：\n
 A. 回答 1 好；回答 2 不好\n
 B. 回答 2 好；回答 1 不好\n
 C. 回答 1、2 都好\n
 D. 回答 1、2 都不好\n
 并在后面解释原因。\n
 再次强调, 如果一个回答不符合 参考答案 或 题目评分指引, 则直接认定这个答案不好。\n
 你的输出应形如：\n
 选择：A\n
 原因：blahblah blahblah\n\n
 """  # noqa
 def build_prompt(question,
                 reference_answer,
                 evaluating_guidance,
                 meta=meta,
                 ics=[]):
    prompt = meta
    for i, eg in enumerate(ics):
        prompt += f'例 {i + 1}: \n'
        prompt += f"问题: <问题开始> {eg['question']} <问题结束>\n\n"
        prompt += f"回答 1: <回答 1 开始> {eg['answer1']} <回答 1 结束>\n\n"
        prompt += f"回答 2: <回答 2 开始> {eg['answer2']} <回答 2 结束>\n\n"
        if 'reference_answer' in eg:
            prompt += f"参考答案: <参考答案开始> {eg['reference_answer']} <参考答案结束>\n\n"
        if 'evaluating_guidance' in eg:
            prompt += (f"题目评分指引: <题目评分指引开始> {eg['evaluating_guidance']} "
                       f'<题目评分指引结束>\n\n')
        if 'choice' in eg:
            prompt += f"选择：{eg['choice']}\n"
        if 'reason' in eg:
            prompt += f"原因：{eg['reason']}\n"
    if len(ics):
        prompt += f'例 {len(ics) + 1}: \n'
    prompt += f'问题: <问题开始> {question} <问题结束>\n\n'
    if not pd.isna(reference_answer):
        prompt += f'参考答案: <参考答案开始> {reference_answer} <参考答案结束>\n\n'
    if not pd.isna('evaluating_guidance'):
        prompt += f'题目评分指引: <题目评分指引开始> {evaluating_guidance} <题目评分指引结束>\n\n'
    return prompt
 example1 = {
    'question':
    '蓝牙耳机坏了应该去看牙科医生还是去看耳科医生？',
    'answer1':
    '蓝牙耳机坏了，应该联系技术支持或维修人员，而不是去看牙科医生或耳科医生。'
    '如果需要维修或更换耳机，这些专业人员通常会提供相关的服务。如果需要医疗帮助，'
    '建议前往当地的医院或诊所，寻求专业医生的帮助。',
    'answer2':
    '如果您的蓝牙耳机出现了问题，建议您首先去看医生。具体来说，'
    '您应该去看牙科医生。虽然蓝牙耳机与听觉有关，但它主要是一种听觉补充设备，'
    '而不是专门用于听觉治疗的设备。如果您的蓝牙耳机出现了问题，牙科医生可以帮助'
    '您诊断问题并提供相应的治疗建议。如果问题仍然存在，牙科医生可能会建议您去看耳科医生。',
    'reference_answer':
    '蓝牙耳机是一种电子设备，坏了不应该找医生。',
    'choice':
    'A',
    'reason':
    '根据参考答案以及常识，蓝牙耳机是一种电子设备，坏了应该找维修人员而不是医生。'
    '回答 1 认识到了这一点，而回答 2 表示应该去看医生。因此回答 1 好，回答 2 不好，选 A'
 }
 example2 = {
    'question': '《小王子》讲述了一个什么样的故事',
    'answer1': """
 《小王子》是法国作家安托万·德·圣·埃克苏佩里创作的一部著名童话小说，讲述了一个来自一个
 小行星上的小王子离开自己的星球，在宇宙中旅行，经历各种奇异的情境，最终来到地球。
 小王子在地球上遇到了各种各样的人，包括狐狸、飞行员和玫瑰。小王子对这些人的情感和经历
 进行了深入的思考和探讨，表达了对生命、友谊、爱情、价值等深刻主题的思考和感悟。
 通过小王子的旅程，作者试图探讨人类内心的复杂性、友谊、爱情、责任、孤独、死亡等主题，
 同时也展示了人类对于自然、宇宙的敬畏和探索精神。小说以其深刻的思想、简洁的语言和独特
 的艺术风格而闻名于世，成为了世界文学史上的经典之一。
 """,
    'answer2': """
 《小王子》是法国作家圣埃克苏佩里创作的一部童话小说，它讲述的是一位来自另一个星球的
 小王子与飞行员之间的友情和冒险。以下是该书主要内容：
 小王子居住在一颗名为B612的行星上，他离开自己的星球开始了探索之旅，在旅途中结识了
 许多奇怪的人物和动物，例如爱虚荣的花、傲慢的狐狸等等。最终，他到达地球并遇到了飞行员。
 通过和飞行员的交流，小王子向读者展示了自己对于友谊、爱情、责任、成长等问题的独特见解。
 最后，小王子离开了地球回到自己的星球，但留下了深刻的印象给了飞行员。
 """,
    'choice': 'C',
    'reason': '本题参考答案及评分指引缺失，但根据常识，两个回答内容均正确描述了'
    '《小王子》的故事内容，且含义相近。因此两个回答都好，选 C'
 }
 example3 = {
    'question':
    '麻辣螺丝钉怎么做',
    'answer1':
    """To make a spicy and garlicky prawn kebab, begin by
    cleaning and deveining the prawns. Then, marinate them in a
    mixture of olive oil, garlic, paprika, cayenne pepper and
    black pepper. Thread the prawns onto metal or soaked wooden
    skewers and grill them over medium-high heat for 5-7 minutes,
    turning frequently. Serve with your favorite dipping sauce.""",
    'answer2':
    """
 辣味螺丝钉是一道四川传统美食，以下是一份简单的麻辣螺丝钉食谱：
 所需材料：
 - 螺旋形螺丝钉（或其他适合烹饪的螺丝）500克
 - 干辣椒适量
 - 花椒适量
 - 姜蒜末适量
 - 盐适量
 - 生抽适量
 - 料酒适量
 - 糖适量
 - 鸡精适量
 - 葱花适量
 - 食用油适量
 步骤：
 1. 将螺丝钉用清水洗净，备用。
 2. 干辣椒和花椒用干锅煸炒至香味出来，捞出备用。
 3. 热锅凉油，放入姜蒜末煸炒出香味。
 4. 加入适量的盐、生抽、料酒、糖、鸡精和少量清水，煮沸后放入螺丝钉，翻炒均匀。
 5. 加入煸炒好的干辣椒和花椒，继续翻炒。
 6. 最后加入适量的葱花，翻炒均匀即可。
 注意事项：
 1. 煮沸后不要煮太长时间，以免螺丝钉过熟变硬。
 2. 可根据个人口味调整辣椒和花椒的量。
 """,
    'reference_answer':
    '麻辣螺丝钉并不是一道实际存在的菜。',
    'choice':
    'D',
    'reason':
    '根据参考答案，麻辣螺丝钉并不是一道实际存在的菜。而两个回答均给出了'
    '这样一道不存在的菜的做法，而并未告知用户这道菜不存在，违背了 Helpful 的性质。因此两个回答都不好，选 D'
 }
 example4 = {
    'question':
    '可以概括一下《奥德赛》这本书的内容吗？',
    'answer1':
    """
 The Odyssey is an epic poem by Homer which tells the story of the Greek hero
 Odysseus and his decade-long journey home after the fall of Troy. Odysseus must
 use his wit and courage to battle gods and monsters in order to make his way
 back home to his wife and son. Along the way, he meets a variety of characters,
 learns valuable lessons, and discovers his own heroism.
 """,
    'answer2':
    """
 《奥德赛》是古希腊诗人荷马所作的史诗之一，主要讲述了希腊英雄奥德修斯（Odysseus）
 在特洛伊战争结束后回到家乡伊萨卡岛的冒险旅程。故事分为24卷，其中前12卷描述了他
 与同伴们离开特洛伊后经历的磨难和险阻，包括遭遇塞壬女妖、被巨人波吕斐摩斯关押等；
 而后12卷则着重描写了他在回归家园的过程中面临的各种挑战，例如对抗女神雅典娜变成的
 牧猪人、解决求婚人之乱等。最终，奥德修斯克服重重困难，终于回到了自己的故乡，
 并夺回了王位和家庭。除此之外，《奥德赛》还涉及到了许多古代希腊社会的风俗习惯、
 宗教信仰和道德观念，具有很高的艺术价值和历史价值。
 """,
    'evaluating_guidance':
    '开放性回答，符合题目要求即可。',
    'choice':
    'B',
    'reason':
    '本题的两个回答都正确概括了奥德赛的内容，但回答 1 使用了英文回答，'
    '而回答 2 使用了中文回答。根据 回答语言应与提问语言一致 的原则，回答 1 不好，而回答 2 好，选 B'
 }
 examples = [example1, example2, example3, example4]
 subjective_reader_cfg = dict(input_columns=[
    'question', 'index', 'reference_answer', 'evaluating_guidance',
    'capability'
 ],
                             output_column=None,
                             train_split='test')
 subjective_all_sets = [
    'subjective_demo',
 ]
@LOAD_DATASET.register_module()
 class SubjectiveCmpDataset(BaseDataset):
-    @staticmethod
+    def load(self, path: str, name: str):
-    def load(path: str, name: str):
+        filename = osp.join(path, f'{name}.json')
-        filename = osp.join(path, f'{name}.xlsx')
+        dataset = DatasetDict()
-        reader = pd.read_excel(filename)
+        raw_data = []
-        reader['prompt'] = reader.apply(
+        with open(filename, 'r', encoding='utf-8') as f:
-            lambda row: build_prompt(row['question'],
+            json_data = json.load(f)
-                                     row['reference_answer'],
+            for problem in json_data:
-                                     row['evaluating_guidance'],
+                question = problem['question']
-                                     ics=examples),
+                capability = problem['capability']
-            axis=1)
+                others = problem['others']
-        return Dataset.from_pandas(reader)
+                raw_data.append({
                    'question': question,
                    'others': others,
                    'judge': {
                        'capability': capability
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
--- a/opencompass/openicl/icl_evaluator/lm_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py
@ -1,8 +1,8 @@
 import os.path as osp
 import random
 from typing import Dict, List, Optional
 import mmengine
 from datasets import Dataset
 from mmengine.config import ConfigDict
 from opencompass.openicl.icl_inferencer import GenInferencer
@ -14,6 +14,23 @@ from opencompass.utils.text_postprocessors import first_number_postprocess
 from opencompass.utils.types import get_type_from_cfg
 def randomize_preds_and_record_references(predictions,
                                          references,
                                          random_order,
                                          seed=2680):
    random.seed(seed)
    list_of_preds = [[] for _ in range(len(predictions))]
    for i in range(len(predictions[0]['model_preds'])):
        preds = [[pred['model_preds'][i], pred['model_name']]
                 for pred in predictions]
        if random_order:
            random.shuffle(preds)
        for j in range(len(preds)):
            list_of_preds[j].append(preds[j][0])
            references[i][f'answer{j+1}'] = preds[j][1]
    return list_of_preds, references
 class LMEvaluator:
    """Evaluate output with language model.
@ -35,7 +52,7 @@ class LMEvaluator:
        prompt_template: ConfigDict,
        judge_cfg: ConfigDict,
        output_path: str,
-        cmp_order: Optional[str] = None,
+        random_order: Optional[bool] = False,
        dataset_cfg: Optional[ConfigDict] = None,
        postprocessor: ConfigDict = dict(type=first_number_postprocess)
    ) -> None:
@ -57,31 +74,20 @@ class LMEvaluator:
        self.postprocessor = get_type_from_cfg(postprocessor)
        self.logger = get_logger()
        self.dataset_cfg = dataset_cfg
-        assert cmp_order in [None, 'as-is', 'reversed', 'both']
+        self.random_order = random_order
        self.cmp_order = cmp_order
    def score(self, predictions, references: Optional[List] = None) -> Dict:
-        if not isinstance(predictions[0], list):
+        if type(predictions) == list:
-            assert self.cmp_order is None, (
+            """Apply to multi-model comparison."""
-                'cmp_order must be None when '
+            references = [{} for _ in range(len(predictions[0]['model_preds']))
-                'only predictions from one model are '
+                          ] if references is None else references
-                'provided.')
+            predictions, references = randomize_preds_and_record_references(
-            predictions = [predictions]
+                predictions, references, self.random_order)
-        else:
+        elif type(predictions) == dict:
-            assert self.cmp_order, ('cmp_order must be specified when '
+            """Apply to single-model scoring."""
-                                    'predictions from multiple models are '
+            references = [{} for _ in range(len(predictions[0]['model_preds']))
-                                    'provided.')
+                          ] if references is None else references
-            if self.cmp_order == 'both':
+            predictions = [predictions['model_preds']]
                predictions = [
                    a + b for a, b in zip(predictions, reversed(predictions))
                ]
                if references:
                    references *= 2
            elif self.cmp_order == 'reversed':
                predictions.reverse()
                if references:
                    references.reverse()
        pred_dict = {}
        for i in range(len(predictions)):
            key = 'prediction' if i == 0 else f'prediction{i + 1}'
@ -89,12 +95,6 @@ class LMEvaluator:
        if self.dataset_cfg:
            dataset = build_dataset_from_cfg(self.dataset_cfg)
            if self.cmp_order == 'both':
                new_ds = {
                    k: dataset.test[k] * 2
                    for k in dataset.test.column_names
                }
                dataset.reader.dataset['test'] = Dataset.from_dict(new_ds)
            for k, v in pred_dict.items():
                dataset.reader.dataset['test'] = dataset.test.add_column(k, v)
                dataset.reader.input_columns.append(k)
@ -114,6 +114,7 @@ class LMEvaluator:
                train_split='test'),
                                    reference=references,
                                    **pred_dict)
        dataset.reader.output_column = 'reference'
        retriever = ZeroRetriever(dataset)
        self.inferencer.inference(retriever=retriever,
                                  prompt_template=self.prompt_tmpl)
@ -124,26 +125,4 @@ class LMEvaluator:
    def postprocess(self, output: Dict) -> Dict:
        """Postprocess output by adding necessary statistics or data into
        it."""
        if self.cmp_order is None:
            # Get average scores if the item is presented
            scores = []
            for k, v in output.items():
                score = self.postprocessor(v['prediction'])
                output[k]['score'] = score
                scores.append(score)
            try:
                output['score'] = sum(scores) / len(scores)
            except Exception:
                pass
        if self.cmp_order == 'both':
            half = len(output) // 2
            for k in list(output.keys())[:half]:
                output[k]['cmp_order'] = 'as-is'
            for k in list(output.keys())[half:]:
                output[k]['cmp_order'] = 'reversed'
        elif self.cmp_order in ['as-is', 'reversed']:
            for k in output.keys():
                output[k]['cmp_order'] = self.cmp_order
        return output
--- a/opencompass/partitioners/sub_naive.py
+++ b/opencompass/partitioners/sub_naive.py
@ -1,4 +1,4 @@
-from itertools import combinations
+from itertools import combinations, product
 from typing import Dict, List, Optional, Tuple
 from mmengine.config import ConfigDict
@ -8,6 +8,18 @@ from opencompass.registry import PARTITIONERS
 from .naive import NaivePartitioner
 def remove_duplicate_pairs(model_combinations):
    combo_dict = {}
    for i, combo in enumerate(model_combinations):
        sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
        if sorted_names not in combo_dict:
            combo_dict[sorted_names] = i
    new_model_combinations = [
        model_combinations[i] for i in combo_dict.values()
    ]
    return new_model_combinations
@PARTITIONERS.register_module()
 class SubjectiveNaivePartitioner(NaivePartitioner):
    """Naive task partitioner for subjective evaluation. Compared to
@ -22,18 +34,34 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
    def __init__(self,
                 mode: str,
                 out_dir: str,
                 models: Optional[List[ConfigDict]] = [],
                 base_models: Optional[List[ConfigDict]] = [],
                 compare_models: Optional[List[ConfigDict]] = [],
                 model_pairs: Optional[List[Tuple]] = None,
                 keep_keys: Optional[List[str]] = None):
        super().__init__(out_dir=out_dir, keep_keys=keep_keys)
-        assert mode in ['all', 'one_to_n', 'fixed']
+        assert mode in ['singlescore', 'allpair', 'm2n', 'fixed']
        self.mode = mode
        self.models = models
        self.base_models = base_models
        self.compare_models = compare_models
        self.model_pairs = model_pairs
-    def get_model_combinations(self, models: List[ConfigDict]) -> List:
+    def get_model_combinations(
-        if self.mode == 'all':
+            self,
            models: List[ConfigDict],
            base_models: Optional[List[ConfigDict]] = [],
            compare_models: Optional[List[ConfigDict]] = []) -> List:
        if self.mode == 'allpair':
            assert len(models) > 1
            return combinations(models, 2)
-        elif self.mode == 'one_to_n':
+        elif self.mode == 'm2n':
-            pass
+            assert len(base_models) > 0 and len(compare_models) > 0
            model_combinations = list(product(base_models, compare_models))
            unique_combinations = remove_duplicate_pairs([
                combo for combo in model_combinations if combo[0] != combo[1]
            ])
            return unique_combinations
        elif self.mode == 'fixed':
            pass
@ -67,8 +95,13 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
        Returns:
            List[Dict]: A list of tasks.
        """
-
+        models = self.models if self.models != [] else models
-        models = self.get_model_combinations(models)
+        base_models, compare_models = self.base_models, self.compare_models
        if self.mode == 'singlescore':
            models = models
        else:
            models = self.get_model_combinations(models, base_models,
                                                 compare_models)
        return super().partition(models=models,
                                 datasets=datasets,
                                 work_dir=work_dir,
--- a/opencompass/summarizers/init.py
+++ b/opencompass/summarizers/init.py
@ -1,9 +1,10 @@
 from .circular import CircularSummarizer
 from .corev2 import Corev2Summarizer
 from .creationv01 import Creationv01Summarizer
 from .default import DefaultSummarizer
 from .subject import SubjectSummarizer
 from .subjective import SubjectiveSummarizer
 __all__ = [
    'CircularSummarizer', 'DefaultSummarizer', 'SubjectiveSummarizer',
-    'SubjectSummarizer'
+    'Corev2Summarizer', 'Creationv01Summarizer'
 ]
--- a/opencompass/summarizers/corev2.py
+++ b/opencompass/summarizers/corev2.py
@ -0,0 +1,172 @@
 # flake8: noqa: E501
 import csv
 import os
 import os.path as osp
 import re
 from collections import defaultdict
 from datetime import datetime
 import mmengine
 from mmengine import ConfigDict
 try:
    from prettytable import from_csv
 except ImportError:
    from_csv = None
 from opencompass.utils import dataset_abbr_from_cfg
 def match_general_answer(s):
    temp = s[0]
    if temp in ['A', 'B', 'C', 'D']:
        return temp
    else:
        return None
 def match_GPT4_answer(s):
    if result := re.findall('(?:选择：|Choice: )([ABCD])', s):
        return result[0]
    else:
        return None
 judge_map = {'smart': match_GPT4_answer, 'other': match_general_answer}
 def call_function(name, arg):
    if name in judge_map:
        return judge_map[name](arg)
    else:
        print('Function not found in the map.')
 class Corev2Summarizer:
    """Do the subjectivity analyze based on evaluation results.
    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """
    def __init__(self, config: ConfigDict, match_method='smart') -> None:
        self.tasks = []
        self.cfg = config
        self.match_method = match_method
    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.
        Args:
            time_str (str): Timestamp for file naming.
        Returns:
            pd.DataFrame: The summary results.
        """
        dataset_cfgs = self.cfg['datasets']
        work_dir = self.cfg['work_dir']
        self.work_dir = work_dir
        self.time_str = time_str
        output_path = osp.join(self.work_dir, 'summary',
                               f'summary_{self.time_str}.txt')
        output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
        mmengine.mkdir_or_exist(output_dir)
        results_folder = osp.join(work_dir, 'results')
        fout = osp.join(output_dir, 'report.csv')
        for subdir in os.listdir(results_folder):
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model1, model2 = subdir.split('_')
                for dataset in dataset_cfgs:
                    dataset_abbr = dataset_abbr_from_cfg(dataset)
                    filepath = os.path.join(subdir_path,
                                            dataset_abbr + '.json')
                    result = mmengine.load(filepath)
                    judged_answers = []
                    references = []
                    for k, v in result.items():
                        judged_answers.append(
                            call_function(self.match_method, v['prediction']))
                        references.append(v['gold'])
                    print(
                        f'Among {len(judged_answers)} judgements, successfully extracted {len(judged_answers)-judged_answers.count(None)} judgements.'
                    )
                    win_both_model1, win_both_model2, half_draw_model1, half_draw_model2, categories = defaultdict(
                        float), defaultdict(float), defaultdict(
                            float), defaultdict(float), defaultdict(float)
                    model1 = references[0]['answer1']
                    model2 = references[0]['answer2']
                    for prediction, reference in zip(judged_answers,
                                                     references):
                        if prediction is not None:
                            categories[reference['capability'].split('-')
                                       [0]] += 1
                            categories[reference['capability']] += 1
                            winner = ''
                            if prediction == 'A':
                                winner = reference['answer1']
                            elif prediction == 'B':
                                winner = reference['answer2']
                            elif prediction == 'C':
                                win_both_model1[reference['capability'].split(
                                    '-')[0]] += 1
                                win_both_model2[reference['capability'].split(
                                    '-')[0]] += 1
                                win_both_model1[reference['capability']] += 1
                                win_both_model2[reference['capability']] += 1
                            if model1 == winner:
                                half_draw_model1[reference['capability'].split(
                                    '-')[0]] += 1
                                win_both_model1[reference['capability'].split(
                                    '-')[0]] += 1
                                half_draw_model1[reference['capability']] += 1
                                win_both_model1[reference['capability']] += 1
                            elif model2 == winner:
                                half_draw_model2[reference['capability'].split(
                                    '-')[0]] += 1
                                win_both_model2[reference['capability'].split(
                                    '-')[0]] += 1
                                half_draw_model2[reference['capability']] += 1
                                win_both_model2[reference['capability']] += 1
                    for capability in categories:
                        if capability not in half_draw_model1:
                            win_both_model1[capability] = 0.0
                            half_draw_model1[capability] = 0.0
                        else:
                            win_both_model1[capability] = round(
                                (win_both_model1[capability] /
                                 categories[capability]) * 100, 2)
                            half_draw_model1[capability] = round(
                                (half_draw_model1[capability] /
                                 categories[capability]) * 100, 2)
                        if capability not in half_draw_model2:
                            win_both_model2[capability] = 0.0
                            half_draw_model2[capability] = 0.0
                        else:
                            win_both_model2[capability] = round(
                                (win_both_model2[capability] /
                                 categories[capability]) * 100, 2)
                            half_draw_model2[capability] = round(
                                (half_draw_model2[capability] /
                                 categories[capability]) * 100, 2)
                    scores = {
                        'win_both_' + model1: win_both_model1,
                        'half_draw_' + model1: half_draw_model1,
                        'win_both_' + model2: win_both_model2,
                        'half_draw_' + model2: half_draw_model2
                    }
                    rows = list(scores.keys())
                    columns = list(scores[rows[0]].keys())
                    with open(fout, 'a+', newline='') as csvfile:
                        writer = csv.writer(csvfile)
                        writer.writerow([model1 + '_vs_' + model2] + columns)
                        for row in rows:
                            writer.writerow(
                                [row] +
                                [scores[row][column] for column in columns])
        with open(fout, 'r') as f:
            x = from_csv(f)
        print(x)
--- a/opencompass/summarizers/creationv01.py
+++ b/opencompass/summarizers/creationv01.py
@ -0,0 +1,124 @@
 # flake8: noqa: E501
 import csv
 import os
 import os.path as osp
 import re
 from collections import defaultdict
 from datetime import datetime
 import mmengine
 from mmengine import ConfigDict
 try:
    from prettytable import from_csv
 except ImportError:
    from_csv = None
 from opencompass.utils import dataset_abbr_from_cfg
 def match_general_answer(s):
    temp = s[0]
    if temp in ['A', 'B', 'C', 'D']:
        return temp
    else:
        return None
 def match_GPT4_answer(s):
    result = re.search(r'分数：(.)', s)
    if result:
        return int(result.group(1))
    else:
        return None
 judge_map = {'smart': match_GPT4_answer, 'other': match_general_answer}
 def call_function(name, arg):
    if name in judge_map:
        return judge_map[name](arg)
    else:
        print('Function not found in the map.')
 class Creationv01Summarizer:
    """Do the subjectivity analyze based on evaluation results.
    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """
    def __init__(self, config: ConfigDict, match_method='smart') -> None:
        self.tasks = []
        self.cfg = config
        self.match_method = match_method
    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.
        Args:
            time_str (str): Timestamp for file naming.
        Returns:
            pd.DataFrame: The summary results.
        """
        dataset_cfgs = self.cfg['datasets']
        work_dir = self.cfg['work_dir']
        self.work_dir = work_dir
        self.time_str = time_str
        output_path = osp.join(self.work_dir, 'summary',
                               f'summary_{self.time_str}.txt')
        output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
        mmengine.mkdir_or_exist(output_dir)
        results_folder = osp.join(work_dir, 'results')
        fout = osp.join(output_dir, 'report.csv')
        for subdir in os.listdir(results_folder):
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model = subdir
                for dataset in dataset_cfgs:
                    dataset_abbr = dataset_abbr_from_cfg(dataset)
                    filepath = os.path.join(subdir_path,
                                            dataset_abbr + '.json')
                    result = mmengine.load(filepath)
                    judged_answers = []
                    references = []
                    for k, v in result.items():
                        judged_answers.append(
                            call_function(self.match_method, v['prediction']))
                        references.append(v['gold'])
                    print(
                        f'Among {len(judged_answers)} judgements, successfully extracted {len(judged_answers)-judged_answers.count(None)} judgements.'
                    )
                    model_scores, categories = defaultdict(float), defaultdict(
                        float)
                    for prediction, reference in zip(judged_answers,
                                                     references):
                        categories[reference['capability']] += 1
                        if prediction is not None:
                            model_scores[reference['capability']] += prediction
                    for capability in categories:
                        if capability not in model_scores:
                            model_scores[capability] = 0.0
                        else:
                            model_scores[capability] = round(
                                model_scores[capability] /
                                categories[capability], 2)
                    scores = {model: model_scores}
                    rows = list(scores.keys())
                    columns = list(scores[rows[0]].keys())
                    with open(fout, 'a+', newline='') as csvfile:
                        writer = csv.writer(csvfile)
                        writer.writerow([''] + columns)
                        for row in rows:
                            writer.writerow(
                                [row] +
                                [scores[row][column] for column in columns])
        with open(fout, 'r') as f:
            x = from_csv(f)
        print(x)
--- a/opencompass/summarizers/subject.py
+++ b/opencompass/summarizers/subject.py
@ -1,80 +0,0 @@
 import csv
 import os
 import os.path as osp
 from datetime import datetime
 import mmengine
 from mmengine import ConfigDict
 try:
    from prettytable import from_csv
 except ImportError:
    from_csv = None
 from opencompass.utils import dataset_abbr_from_cfg
 class SubjectSummarizer:
    """Do the subjectivity analyze based on evaluation results.
    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """
    def __init__(
        self,
        config: ConfigDict,
    ) -> None:
        self.tasks = []
        self.cfg = config
    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.
        Args:
            time_str (str): Timestamp for file naming.
        Returns:
            pd.DataFrame: The summary results.
        """
        dataset_cfgs = self.cfg['datasets']
        work_dir = self.cfg['work_dir']
        self.work_dir = work_dir
        self.time_str = time_str
        output_path = osp.join(self.work_dir, 'summary',
                               f'summary_{self.time_str}.txt')
        output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
        mmengine.mkdir_or_exist(output_dir)
        results_folder = osp.join(work_dir, 'results')
        for subdir in os.listdir(results_folder):
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                for dataset in dataset_cfgs:
                    model1, model2 = dataset['eval_cfg']['evaluator'][
                        'base_model'], dataset['eval_cfg']['evaluator'][
                            'compare_model']
                    dataset_abbr = dataset_abbr_from_cfg(dataset)
                    filepath = os.path.join(subdir_path,
                                            dataset_abbr + '.json')
                    result = mmengine.load(filepath)
                    rows = list(result.keys())
                    columns = list(result[rows[0]].keys())
                    fout = osp.join(output_dir,
                                    model1 + '_vs_' + model2 + '.csv')
                    print(
                        '###############################Subjective Results on '
                        + model1 + '_vs_' + model2 +
                        '###############################')
                    with open(fout, 'w', newline='') as csvfile:
                        writer = csv.writer(csvfile)
                        writer.writerow([model1 + '_vs_' + model2] + columns)
                        for row in rows:
                            writer.writerow(
                                [row] +
                                [result[row][column] for column in columns])
                    with open(fout, 'r') as f:
                        x = from_csv(f)
                    print(x)
--- a/opencompass/tasks/subjective_eval.py
+++ b/opencompass/tasks/subjective_eval.py
@ -10,13 +10,11 @@ import mmengine
 from mmengine.config import Config, ConfigDict
 from mmengine.utils import mkdir_or_exist
 from opencompass.openicl.icl_evaluator.lm_evaluator import LMEvaluator
 from opencompass.registry import ICL_EVALUATORS, MODELS, TEXT_POSTPROCESSORS
 from opencompass.tasks.base import BaseTask
 from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
                               get_infer_output_path, get_logger,
                               task_abbr_from_cfg)
 from opencompass.utils.types import get_type_from_cfg
 class SubjectiveEvalTask(BaseTask):
@ -137,8 +135,7 @@ class SubjectiveEvalTask(BaseTask):
                kwargs = pred_postprocessor or eval_cfg['pred_postprocessor']
                proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
                pred_strs = [proc(s, **kwargs) for s in pred_strs]
-
+        return {'model_name': model_cfg['abbr'], 'model_preds': pred_strs}
        return pred_strs
    def _score(self, model_cfg, dataset_cfg, eval_cfg, output_column):
        test_set = build_dataset_from_cfg(dataset_cfg).test
@ -153,17 +150,12 @@ class SubjectiveEvalTask(BaseTask):
                return sample
            test_set = test_set.map(postprocess)
        # Get out_path
        out_path = get_infer_output_path(model_cfg, dataset_cfg,
                                         osp.join(self.work_dir, 'results'))
        model_preds = self._load_model_pred(model_cfg, dataset_cfg, eval_cfg)
        if get_type_from_cfg(eval_cfg['evaluator']) == LMEvaluator:
        if not self.judge_cfg:
-                raise ValueError('Using LMEvaluator in dataset, but '
+            raise ValueError('missing "eval.runner.task.judge_cfg"')
                                 'missing "eval.runner.task.judge_cfg" '
                                 'as the judge configuration.')
        eval_cfg['evaluator']['judge_cfg'] = self.judge_cfg
        eval_cfg['evaluator']['dataset_cfg'] = dataset_cfg
        eval_cfg['evaluator']['output_path'] = out_path
@ -177,7 +169,8 @@ class SubjectiveEvalTask(BaseTask):
                f'Task {task_abbr_from_cfg(self.cfg)}: {result["error"]}')
            return
        else:
-            self.logger.info(f'Task {task_abbr_from_cfg(self.cfg)}: {result}')
+            self.logger.info(
                f'Task {task_abbr_from_cfg(self.cfg)}')  #: {result}')
        # Save result
        mkdir_or_exist(osp.split(out_path)[0])