[Fix] Compatible with old versions (#1616)

* fix pip version

* Compatible with old versions

* update configs
bittersweet1999 2024-10-21 10:16:29 +08:00 committed by GitHub
parent 6e8adf5221
commit a11e2b2fd4
52 changed files with 2337 additions and 54 deletions


@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .compassbench_v1_3_objective_gen_068af0 import compassbench_aug_datasets # noqa: F401, F403


@ -0,0 +1,74 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets.compassbench_obj import CompassBenchObjectiveV1_3, compassbench_objective_v1_3_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
prompt_cn = {
'single_choice_cn': '以下是一道单项选择题请你根据你了解的知识给出正确的答案选项。请你一步步推理并在最后用“答案选项为X”来回答其中X是ABCD中你认为正确的选项序号\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
'cloze_cn': '以下是一道填空题,请你根据你了解的知识一步步思考后把你的最终答案放到\\boxed{}中。\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
}
prompt_en = {
'single_choice_en': "Here is a single-choice question. Please give the correct answer based on your knowledge. Please reason step by step and answer with 'The answer is X' at the end, where X is the option number you think is correct.\nHere is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
'cloze_en': "Here is a fill-in-the-blank question. Please think step by step based on your knowledge and put your final answer in \\boxed{}. Here is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
}
douknow_sets = {
'knowledge': ['single_choice_cn'],
'math': ['single_choice_cn'],
}
# Set up the prompts
CircularEval = True
compassbench_aug_datasets = []
for _split in list(douknow_sets.keys()):
for _name in douknow_sets[_split]:
if 'cn' in _name:
single_choice_prompts = prompt_cn
cloze_prompts = prompt_cn
else:
single_choice_prompts = prompt_en
cloze_prompts = prompt_en
douknow_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt= single_choice_prompts[_name],
),
dict(role='BOT', prompt='{answer}'),] if 'choice' in _name else cloze_prompts[_name],
),
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
douknow_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if CircularEval else AccEvaluator) if 'single_choice' in _name else dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD' ) if 'single_choice' in _name else dict(type=compassbench_objective_v1_3_postprocess, name=_name))
compassbench_aug_datasets.append(
dict(
type=CompassBenchObjectiveV1_3,
path=f'./data/compassbench_v1_3/{_split}/{_name}.jsonl',
name='circular_' + _name if CircularEval else _name,
abbr='compassbench-' + _split + '-' + _name + 'circular'if CircularEval else '',
reader_cfg=dict(
input_columns=['question'],
output_column='answer'
),
infer_cfg=douknow_infer_cfg,
eval_cfg=douknow_eval_cfg,
))
del _split, _name
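For context, the `CircularEvaluator` enabled above via `CircularEval = True` scores multiple-choice items under option rotation: each question is asked several times with the answer options cyclically shifted, and in the strictest setting an item only counts as correct when every rotation is answered correctly. A framework-agnostic sketch of that idea (not the OpenCompass implementation):

```python
# Conceptual sketch of circular evaluation (assumed strictest criterion):
# rotate the options of a multiple-choice question and require the model to
# pick the relabelled correct answer in every rotation.
def circular_correct(ask_model, question, options, answer_idx):
    # ask_model(question, options) -> index of the option the model chooses
    n = len(options)
    for shift in range(n):
        rotated = options[shift:] + options[:shift]
        correct = (answer_idx - shift) % n
        if ask_model(question, rotated) != correct:
            return False
    return True
```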


@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import GPQADataset, GPQAEvaluator
from opencompass.utils import first_option_postprocess
gpqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer')
hint = f'对下面的单项选择题,请直接给出正确答案的选项。'
question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'
gpqa_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={
opt: f'{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
),
prompt_template=dict(
type=PromptTemplate,
template={
opt: f'{hint}\n</E>{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']
},
ice_token='</E>'
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer))
gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
gpqa_datasets = []
gpqa_subsets = {
# 'extended': 'gpqa_extended.csv',
# 'main': 'gpqa_main.csv',
'diamond': 'gpqa_diamond.csv'
}
for split in list(gpqa_subsets.keys()):
gpqa_datasets.append(
dict(
abbr='GPQA_' + split,
type=GPQADataset,
path='./data/gpqa/',
name=gpqa_subsets[split],
reader_cfg=gpqa_reader_cfg,
infer_cfg=gpqa_infer_cfg,
eval_cfg=gpqa_eval_cfg)
)
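The per-option templates above are what drive the `PPLInferencer`: every candidate letter is rendered into a complete prompt that ends with that answer, and the option whose rendered text the model scores as most likely (lowest perplexity) is selected. A rough, framework-agnostic sketch of that selection step, not the OpenCompass implementation:

```python
# Conceptual sketch: score each option's fully rendered prompt and keep the one
# with the highest total log-probability (equivalently, the lowest perplexity).
def pick_option(logprob_fn, rendered_prompts):
    # logprob_fn: callable returning the summed log-probability of a text
    # rendered_prompts: e.g. {'A': '...Answer: A', 'B': '...Answer: B', ...}
    scores = {opt: logprob_fn(text) for opt, text in rendered_prompts.items()}
    return max(scores, key=scores.get)
```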


@ -0,0 +1,47 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MMLUProDataset, MMLUProBaseEvaluator
with read_base():
from .mmlu_pro_categories import categories
mmlu_pro_datasets = []
for category in categories:
hint = f'Answer the following multiple choice question about {category}, and give your answer option directly.'
question_and_options = 'Question:\n{question}\nOptions:\n{options_str}'
mmlu_pro_reader_cfg = dict(
input_columns=['question', 'cot_content', 'options_str'],
output_column='answer_string',
train_split='validation',
test_split='test',
)
mmlu_pro_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=f'{question_and_options}\nAnswer: {{answer}}'),
prompt_template=dict(
type=PromptTemplate,
template=f'{hint}\n</E>{question_and_options}\nAnswer: ',
ice_token='</E>'
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=GenInferencer, max_out_len=100)
)
mmlu_pro_eval_cfg = dict(
evaluator=dict(type=MMLUProBaseEvaluator)
)
mmlu_pro_datasets.append(
dict(
abbr=f'mmlu_pro_{category.replace(" ", "_")}',
type=MMLUProDataset,
path='opencompass/mmlu_pro',
category=category,
reader_cfg=mmlu_pro_reader_cfg,
infer_cfg=mmlu_pro_infer_cfg,
eval_cfg=mmlu_pro_eval_cfg,
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
from opencompass.summarizers import AlignmentBenchSummarizer
from opencompass.datasets import AlignmentBenchDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -46,7 +47,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
@ -63,4 +63,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))


@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
output_column='judge',
)
subjective_all_sets = [
'alignment_bench',
]
data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
path=data_path,
name=_name,
alignment_bench_config_path=alignment_bench_config_path,
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
from opencompass.summarizers import AlignmentBenchSummarizer
from opencompass.datasets import AlignmentBenchDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -46,7 +47,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
@ -63,4 +63,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))


@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
output_column='judge',
)
subjective_all_sets = [
'alignment_bench_v1_1', # Changed to Alignbench_v1_1 since 06/15/2024, refer to https://github.com/THUDM/AlignBench
]
data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
path=data_path,
name=_name,
alignment_bench_config_path=alignment_bench_config_path,
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_postprocess
from opencompass.datasets import SubjectiveCmpDataset
from opencompass.summarizers import AlpacaSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -94,7 +95,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alpacaeval_postprocess),
),
pred_role='BOT',
)
@ -102,7 +102,7 @@ for _name in subjective_all_sets:
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
type=SubjectiveCmpDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
@ -111,5 +111,6 @@ for _name in subjective_all_sets:
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}],
summarizer=dict(type=AlpacaSummarizer, judge_type='v2'),
))


@ -0,0 +1,115 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'alpaca_eval',
]
alpacav2_datasets = []
gpt4_prompt = """
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
## Instruction
{
"instruction": "{question}",
}
## Model Outputs
Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
{
{
"model_identifier": "m",
"output": "{prediction}"
},
{
"model_identifier": "M",
"output": "{prediction2}"
}
}
## Task
Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
## Best Model Identifier
"""
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.')
],
round=[
dict(
role='HUMAN',
prompt = gpt4_prompt
),
]),
),
dict_postprocessor=dict(type=alpacaeval_postprocess),
),
pred_role='BOT',
)
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import ArenaHardDataset, arenahard_postprocess
from opencompass.datasets import ArenaHardDataset
from opencompass.summarizers import ArenaHardSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -59,7 +60,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=arenahard_postprocess),
),
pred_role='BOT',
)
@ -76,5 +76,6 @@ for _name in subjective_all_sets:
mode='m2n',
infer_order='double',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}],
summarizer = dict(type=ArenaHardSummarizer),
))


@ -0,0 +1,80 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import ArenaHardDataset, arenahard_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'arenahard',
]
arenahard_datasets = []
system_prompt = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
judge_prompt = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{prediction}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{prediction2}\n<|The End of Assistant B's Answer|>"
gpt4 = [dict(
abbr='gpt4-0314',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=system_prompt)
],
round=[
dict(
role='HUMAN',
prompt = judge_prompt
),
]),
),
dict_postprocessor=dict(type=arenahard_postprocess),
),
pred_role='BOT',
)
arenahard_datasets.append(
dict(
abbr='arenahard',
type=ArenaHardDataset,
path='./data/subjective/arena_hard',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
))


@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.datasets import CompassArenaDataset
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
@ -15,23 +15,29 @@ data_path ='data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答12平局
并提供你的解释原因
如果你认为回答1更好你的输出应形如
选择A
原因blahblah blahblah\n
如果你认为回答2更好你的输出应形如
选择B
原因blahblah blahblah\n
如果你认为回答12打成平手你的输出应形如
选择C
原因blahblah blahblah\n
@ -43,8 +49,10 @@ knowledge_prompt = """
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下更好的回答能对知识点进行额外补充且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -56,8 +64,10 @@ language_prompt = """
1. 在有明确的参考答案的情况下越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅更加符合与人类对话的习惯包括语气情调等
3. 在都准确答对问题的前提下更好的回答能进行额外补充且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -69,8 +79,10 @@ math_prompt = """
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -83,6 +95,7 @@ creation_prompt = """
1. 好的回答必须首先符合用户问题里的各种需求不能跑题
2. 好的回答必须具有逻辑连贯性围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt
@ -120,7 +133,6 @@ for _name, _prompt in sub_map.items():
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
@ -137,6 +149,6 @@ for _name, _prompt in sub_map.items():
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))


@ -0,0 +1,142 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'ref'],
output_column='judge',
)
data_path ='data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答12平局
并提供你的解释原因
如果你认为回答1更好你的输出应形如
选择A
原因blahblah blahblah\n
如果你认为回答2更好你的输出应形如
选择B
原因blahblah blahblah\n
如果你认为回答12打成平手你的输出应形如
选择C
原因blahblah blahblah\n
"""
knowledge_prompt = """
请根据提供的 评分要求用户问题参考答案 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下更好的回答能对知识点进行额外补充且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
language_prompt = """
请根据提供的 评分要求用户问题 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 在有明确的参考答案的情况下越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅更加符合与人类对话的习惯包括语气情调等
3. 在都准确答对问题的前提下更好的回答能进行额外补充且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
math_prompt = """
请根据提供的 评分要求用户问题参考答案 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
reason_prompt = math_prompt
creation_prompt = """
请根据提供的 评分要求用户问题 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 好的回答必须首先符合用户问题里的各种需求不能跑题
2. 好的回答必须具有逻辑连贯性围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt
sub_map = {'language': language_prompt, 'knowledge': knowledge_prompt, 'reason_v2': reason_prompt, 'math_v2': math_prompt, 'creationv2_zh': creation_prompt}
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name, _prompt in sub_map.items():
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = _prompt
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
compassarena_datasets.append(
dict(
abbr=f'compassarena_{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -112,7 +113,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
@ -127,4 +127,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))


@ -0,0 +1,130 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts', 'fofo_test_prompts_cn',
]
base_prompt_en = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
base_prompt_cn = """
我希望你创建一个排行榜用于评估来自各种大型语言模型的回答格式的正确性为了完成这个任务你将需要分析给模型的文本提示以及它们对应的回答具体来说请确保你的评估输出正确地格式化为JSON字符串我将为此提供提示和回答
以下是提示内容
{
"instruction": "{question}",
}
以下是模型的输出结果
[
{
"model": "model",
"answer": "{prediction}"
},
]
请通过检查模型回答是否符合提示中声明的格式规范来评估模型回答的格式进行彻底的格式检查并提供格式正确或错误的详细解释你的反馈应包括模型的名称接着是格式正确性的状态'1'表示正确'0'表示错误将你的推理以每个评估模型的单个字符串中的 bullet 点形式呈现换句话说你应该生成以下输出
```json
[
{
'model': <模型名称>,
'format_correctness': <正确性>,
'reasons': <格式正确性的原因>
}
]
```
请注意你的回答应是一个正确格式化的JSON字符串不应包含任何额外的内容我们将在Python中直接将其作为JSON字符串加载
"""
fofo_datasets = []
for _name in subjective_all_sets:
if '_cn' in _name:
base_prompt = base_prompt_cn
else:
base_prompt = base_prompt_en
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -80,7 +81,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
@ -95,4 +95,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))


@ -0,0 +1,98 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts'
]
base_prompt = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
fofo_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset, followbench_postprocess
from opencompass.datasets import FollowBenchDataset
from opencompass.summarizers import FollowBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['instruction', 'judge_prompt',],
@ -43,7 +44,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=followbench_postprocess),
),
pred_role='BOT',
)
@ -59,4 +59,5 @@ for _name in subjective_all_sets:
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
summarizer = dict(type=FollowBenchSummarizer,)
))


@ -0,0 +1,62 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset, followbench_postprocess
subjective_reader_cfg = dict(
input_columns=['instruction', 'judge_prompt',],
output_column='judge',
)
subjective_all_sets = [
'followbench_llmeval_cn', 'followbench_llmeval_en',
]
data_path ='data/subjective/followbench/converted_data'
followbench_llmeval_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{instruction}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{judge_prompt}'
),
]),
),
dict_postprocessor=dict(type=followbench_postprocess),
),
pred_role='BOT',
)
followbench_llmeval_datasets.append(
dict(
abbr=f'{_name}',
type=FollowBenchDataset,
path=data_path,
name=_name,
mode='singlescore',
cate='llm',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
))


@ -0,0 +1,51 @@
# Guidelines for Evaluating HelloBench on Diverse LLMs
HelloBench is a comprehensive, in-the-wild, and open-ended benchmark for evaluating LLMs' performance in long text generation. More details can be found in the [🌐Github Repo](https://github.com/Quehry/HelloBench) and the [📖Paper](https://arxiv.org/abs/2409.16191).
## Detailed instructions for evaluating HelloBench in OpenCompass
1. Git clone OpenCompass
```shell
cd ~
git clone git@github.com:open-compass/opencompass.git
cd opencompass
```
2. Download the HelloBench data from this [Google Drive URL](https://drive.google.com/file/d/1EJTmMFgCs2pDy9l0wB5idvp3XzjYEsi9/view?usp=sharing), unzip it, and put it under `OPENCOMPASS_PATH/data/HelloBench`, so that you end up with a layout like this:
```
~/opencompass/data/
└── HelloBench
├── chat.jsonl
├── heuristic_text_generation.jsonl
├── length_constrained_data
│ ├── heuristic_text_generation_16k.jsonl
│ ├── heuristic_text_generation_2k.jsonl
│ ├── heuristic_text_generation_4k.jsonl
│ └── heuristic_text_generation_8k.jsonl
├── open_ended_qa.jsonl
├── summarization.jsonl
└── text_completion.jsonl
```
3. Set up OpenCompass
```shell
cd ~/opencompass
pip install -e .
```
4. Configure your run in configs/eval_hellobench.py (a minimal sketch is given after this list):
- set the models to be evaluated
- set your judge model (we recommend gpt4o-mini)
5. Launch it:
```shell
python run.py configs/eval_hellobench.py
```
6. After the run finishes, you can find the results in outputs/hellobench/xxx/summary
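The launch config referenced in step 4 is not part of this commit; a minimal sketch of what `configs/eval_hellobench.py` could look like is shown below. The import path and the empty `models` / `judge_models` placeholders are assumptions to adapt to your setup; only `hellobench_datasets` comes from the dataset config added in this PR.

```python
# Minimal sketch of configs/eval_hellobench.py (assumed layout; adjust the
# import path and fill in real model configs before running).
from mmengine.config import read_base

with read_base():
    # Hypothetical import path for the HelloBench dataset config in this PR.
    from .datasets.subjective.hellobench.hellobench import hellobench_datasets

datasets = [*hellobench_datasets]

models = []        # the models you want to evaluate
judge_models = []  # your judge model (gpt4o-mini is recommended)
```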


@ -0,0 +1,111 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import HelloBenchDataset, hellobench_postprocess
system_prompt = """You are a helpful evaluator. Your task is to evaluate the checklists of the responses given by the Large Language Models (LLMs) based on user instructions. These checklists consist of yes or no questions."""
user_prompt = """Your core task is to evaluate the checklists based on the user's instruction and the LLM's response, with each checklist item being a yes or no question indicating a specific aspect that the LLM's response should meet. You need to judge each checklist item based on the instruction and response. The evaluation results are scored from 0 to 1, with 5 scores in total, which are:
0: The response fails to meet the checklist requirements, demonstrating substantial need for improvement across multiple areas.
0.25: The response partially meets some checklist requirements, but significant elements remain unaddressed.
0.5: The response meets several checklist requirements, yet the overall evaluation appears ambiguous or unclear.
0.75: The response aligns with most checklist requirements, though there are still minor areas that could be refined or enhanced.
1: The response fully satisfies all checklist requirements, with no identifiable issues or areas for improvement. It means this response is already perfect; you can't find any significant flaws in it.
Here is the instruction:
{{\"instruction\": {instruction}}}
Here is the response given by LLM:
{{\"response\": {prediction}}}
Since the response may be rather long, I am specifically reminding you here that the response has ended.
Here are checklists of this instruction:
{{\"checklists\": {formatted_checklists}}}
To further remind you, I will repeat my requirements:
Your core task is to evaluate the checklists based on the user's instruction and the LLM's response, with each checklist item being a yes or no question indicating a specific aspect that the LLM's response should meet. You need to judge each checklist item based on the instruction and response. The evaluation results are scored from 0 to 1, with 5 scores in total, which are:
0: The response fails to meet the checklist requirements, demonstrating substantial need for improvement across multiple areas.
0.25: The response partially meets some checklist requirements, but significant elements remain unaddressed.
0.5: The response meets several checklist requirements, yet the overall evaluation appears ambiguous or unclear.
0.75: The response aligns with most checklist requirements, though there are still minor areas that could be refined or enhanced.
1: The response fully satisfies all checklist requirements, with no identifiable issues or areas for improvement. It means this response is already perfect; you can't find any significant flaws in it.
Always provide the reason for your evaluation results. You should be strict but fair in your evaluation. A score of 1 means that the response perfectly meets all the checklist requirements and you think there is really no room for improvement. When giving a score of 1, you need to carefully consider whether this checklist has been perfectly satisfied.
Evaluate all the checklists and return the evaluation results of the checklists. Output a Python List consisting of the Python Dictionary formatted as follows:
[{{\"checklist_id\": \"the id of the checklist\", \"reason\": \"The reason for your evaluation results\", \"evaluation_score\": \"Your evaluation score for this checklist\"}},{{\"checklist_id\": \"the id of the checklist\",
\"reason\": \"The reason for your evaluation results\", \"evaluation_score\": \"Your evaluation score for this checklist\"}}]
There are a total of {num_checklist} checklists that you need to evaluate. The length of the output list is equal to the number of checklists and you should give an evaluation score for each checklist. You should be very very very strict in the evaluation to further compare the responses from different models. Your response must be a valid Python List and should contain nothing else, as it will be directly executed in Python."""
subjective_reader_cfg = dict(
input_columns=['instruction', 'formatted_checklists', 'num_checklist'],
output_column='judgement',
)
hellobench_categories = [
'open_ended_qa',
'summarization',
'chat',
'text_completion',
'heuristic_text_generation',
]
data_path ='data/HelloBench'
hellobench_datasets = []
for category_name in hellobench_categories:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{instruction}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=16384),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=system_prompt)
],
round=[
dict(
role='HUMAN',
prompt = user_prompt
),
]),
),
dict_postprocessor=dict(type=hellobench_postprocess,),
),
pred_role='BOT',
)
hellobench_datasets.append(
dict(
abbr=f'HelloBench-{category_name}',
type=HelloBenchDataset,
path=data_path,
category_name=category_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBench101Dataset, mtbench101_postprocess
from opencompass.datasets import MTBench101Dataset
from opencompass.summarizers import MTBench101Summarizer
subjective_reader_cfg = dict(
input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
@ -45,7 +46,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=mtbench101_postprocess),
),
pred_role='BOT',
)
@ -60,4 +60,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBench101Summarizer, judge_type='single')
))


@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBench101Dataset, mtbench101_postprocess
subjective_reader_cfg = dict(
input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
output_column='judge',
)
subjective_all_sets = [
'mtbench101',
]
data_path ='data/subjective/'
mtbench101_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = '{prompt_template}'
),
]),
),
dict_postprocessor=dict(type=mtbench101_postprocess),
),
pred_role='BOT',
)
mtbench101_datasets.append(
dict(
abbr=f'{_name}',
type=MTBench101Dataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset, mtbench_postprocess
from opencompass.datasets import MTBenchDataset
from opencompass.summarizers import MTBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
@ -47,7 +48,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=mtbench_postprocess),
),
pred_role='BOT',
)
@ -62,4 +62,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBenchSummarizer, judge_type='single')
))


@ -0,0 +1,65 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset, mtbench_postprocess
subjective_reader_cfg = dict(
input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
output_column='judge',
)
subjective_all_sets = [
'mtbench_0.0','mtbench_0.1','mtbench_0.7'
]
data_path ='data/subjective/mtbench'
mtbench_datasets = []
for _name in subjective_all_sets:
temperature = float(_name.split('_')[1])
do_sample = False if temperature == 0.0 else True
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=1024, temperature=temperature, do_sample=do_sample,infer_mode='every'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = '{prompt_template}'
),
]),
),
dict_postprocessor=dict(type=mtbench_postprocess),
),
pred_role='BOT',
)
mtbench_datasets.append(
dict(
abbr=f'{_name}',
type=MTBenchDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WildBenchDataset, wildbench_postprocess
from opencompass.datasets import WildBenchDataset
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
@ -30,7 +30,6 @@ subjective_eval_cfg = dict(
type=PromptTemplate,
template="""{prompt}"""
),
dict_postprocessor=dict(type=wildbench_postprocess),
),
pred_role='BOT',
)
@ -63,4 +62,5 @@ wildbench_datasets.append(
        mode='m2n',  # m models are matched against n models
infer_order='random',
base_models = [llama_2_70b, gpt4, claude],
summarizer = dict(type=WildBenchPairSummarizer),
))


@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WildBenchDataset, wildbench_postprocess
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'prompt'],
output_column='judge',
)
data_path ='./data/subjective/WildBench/wildbench.jsonl'
wildbench_datasets = []
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}"""
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template="""{prompt}"""
),
dict_postprocessor=dict(type=wildbench_postprocess),
),
pred_role='BOT',
)
gpt4 = dict(
abbr='gpt4-turbo',
)
claude = dict(
abbr='HaiKu',
)
llama_2_70b = dict(
abbr='llama-2-70b-chat-hf',
)
wildbench_datasets.append(
dict(
abbr='wildbench',
type=WildBenchDataset,
path=data_path,
eval_mode='pair',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/subjective/WildBench/gpt4'},
{'abbr': 'llama-2-70b-chat-hf', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'HaiKu', 'path':'./data/subjective/WildBench/claude'},
{'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
        mode='m2n',  # m models are matched against n models
infer_order='random',
base_models = [llama_2_70b, gpt4, claude],
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
from opencompass.summarizers import AlignmentBenchSummarizer
from opencompass.datasets import AlignmentBenchDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -46,7 +47,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
@ -63,4 +63,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))


@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
output_column='judge',
)
subjective_all_sets = [
'alignment_bench',
]
data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
path=data_path,
name=_name,
alignment_bench_config_path=alignment_bench_config_path,
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
from opencompass.summarizers import AlignmentBenchSummarizer
from opencompass.datasets import AlignmentBenchDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -46,7 +47,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
@ -63,4 +63,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))


@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
output_column='judge',
)
subjective_all_sets = [
'alignment_bench_v1_1', # Changed to Alignbench_v1_1 since 06/15/2024, refer to https://github.com/THUDM/AlignBench
]
data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
path=data_path,
name=_name,
alignment_bench_config_path=alignment_bench_config_path,
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_postprocess
from opencompass.datasets import SubjectiveCmpDataset
from opencompass.summarizers import AlpacaSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -94,7 +95,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alpacaeval_postprocess),
),
pred_role='BOT',
)
@ -102,7 +102,7 @@ for _name in subjective_all_sets:
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
type=SubjectiveCmpDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
@ -111,5 +111,6 @@ for _name in subjective_all_sets:
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}],
summarizer=dict(type=AlpacaSummarizer, judge_type='v2'),
))


@ -0,0 +1,115 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'alpaca_eval',
]
alpacav2_datasets = []
gpt4_prompt = """
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
## Instruction
{
"instruction": "{question}",
}
## Model Outputs
Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
{
{
"model_identifier": "m",
"output": "{prediction}"
},
{
"model_identifier": "M",
"output": "{prediction2}"
}
}
## Task
Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
## Best Model Identifier
"""
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.')
],
round=[
dict(
role='HUMAN',
prompt = gpt4_prompt
),
]),
),
dict_postprocessor=dict(type=alpacaeval_postprocess),
),
pred_role='BOT',
)
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import ArenaHardDataset, arenahard_postprocess
from opencompass.datasets import ArenaHardDataset
from opencompass.summarizers import ArenaHardSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -59,7 +60,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=arenahard_postprocess),
),
pred_role='BOT',
)
@ -76,5 +76,6 @@ for _name in subjective_all_sets:
mode='m2n',
infer_order='double',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}],
summarizer = dict(type=ArenaHardSummarizer),
))

View File

@ -0,0 +1,80 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import ArenaHardDataset, arenahard_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'arenahard',
]
arenahard_datasets = []
system_prompt = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
judge_prompt = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{prediction}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{prediction2}\n<|The End of Assistant B's Answer|>"
gpt4 = [dict(
abbr='gpt4-0314',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=system_prompt)
],
round=[
dict(
role='HUMAN',
prompt = judge_prompt
),
]),
),
dict_postprocessor=dict(type=arenahard_postprocess),
),
pred_role='BOT',
)
arenahard_datasets.append(
dict(
abbr='arenahard',
type=ArenaHardDataset,
path='./data/subjective/arena_hard',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
))

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.datasets import CompassArenaDataset
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
@ -15,23 +15,29 @@ data_path ='data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求,在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
并提供你的解释原因。
如果你认为回答1更好,你的输出应形如:
选择:A
原因:blahblah blahblah\n
如果你认为回答2更好,你的输出应形如:
选择:B
原因:blahblah blahblah\n
如果你认为回答1、2打成平手,你的输出应形如:
选择:C
原因:blahblah blahblah\n
@ -43,8 +49,10 @@ knowledge_prompt = """
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -56,8 +64,10 @@ language_prompt = """
1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -69,8 +79,10 @@ math_prompt = """
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -83,6 +95,7 @@ creation_prompt = """
1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt
@ -120,7 +133,6 @@ for _name, _prompt in sub_map.items():
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
@ -137,6 +149,6 @@ for _name, _prompt in sub_map.items():
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))

View File

@ -0,0 +1,142 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'ref'],
output_column='judge',
)
data_path ='data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求,在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
并提供你的解释原因。
如果你认为回答1更好,你的输出应形如:
选择:A
原因:blahblah blahblah\n
如果你认为回答2更好,你的输出应形如:
选择:B
原因:blahblah blahblah\n
如果你认为回答1、2打成平手,你的输出应形如:
选择:C
原因:blahblah blahblah\n
"""
knowledge_prompt = """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
language_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
math_prompt = """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
reason_prompt = math_prompt
creation_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt
sub_map = {'language': language_prompt, 'knowledge': knowledge_prompt, 'reason_v2': reason_prompt, 'math_v2': math_prompt, 'creationv2_zh': creation_prompt}
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name, _prompt in sub_map.items():
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = _prompt
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
compassarena_datasets.append(
dict(
abbr=f'compassarena_{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))
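The judge prompts above instruct the evaluator model to reply with a literal "选择:A/B/C" line followed by a rationale, which the configured compassarena_postprocess then scores. As a rough, hand-written illustration of that extraction step (an assumption sketched here, not the actual postprocessor):

```python
# Minimal verdict-extraction sketch (assumption for illustration; the real
# logic lives in opencompass's compassarena_postprocess).
import re
from typing import Optional

def extract_choice(judge_reply: str) -> Optional[str]:
    """Pull the A/B/C verdict out of a reply shaped like '选择:A 原因:...'."""
    match = re.search(r'选择[::]\s*([ABC])', judge_reply)
    return match.group(1) if match else None

assert extract_choice('选择:A\n原因:回答1更贴近参考答案') == 'A'
```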

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -112,7 +113,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
@ -127,4 +127,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))

View File

@ -0,0 +1,130 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts', 'fofo_test_prompts_cn',
]
base_prompt_en = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
base_prompt_cn = """
我希望你创建一个排行榜,用于评估来自各种大型语言模型的回答格式的正确性。为了完成这个任务,你将需要分析给模型的文本提示以及它们对应的回答。具体来说,请确保你的评估输出正确地格式化为JSON字符串。我将为此提供提示和回答。
以下是提示内容:
{
"instruction": "{question}",
}
以下是模型的输出结果:
[
{
"model": "model",
"answer": "{prediction}"
},
]
请通过检查模型回答是否符合提示中声明的格式规范,来评估模型回答的格式。进行彻底的格式检查,并提供格式正确或错误的详细解释。你的反馈应包括模型的名称,接着是格式正确性的状态,'1'表示正确,'0'表示错误。将你的推理以每个评估模型的单个字符串中的 bullet 点形式呈现。换句话说,你应该生成以下输出:
```json
[
{
'model': <模型名称>,
'format_correctness': <正确性>,
'reasons': <格式正确性的原因>
}
]
```
请注意,你的回答应是一个正确格式化的JSON字符串,不应包含任何额外的内容。我们将在Python中直接将其作为JSON字符串加载。
"""
fofo_datasets = []
for _name in subjective_all_sets:
if '_cn' in _name:
base_prompt = base_prompt_cn
else:
base_prompt = base_prompt_en
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))
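The FoFo judge prompt above pins the verdict to an exact JSON schema and states that the reply will be loaded directly as a JSON string in Python. The snippet below is an illustration of that round trip with a hand-written sample reply; both the sample and the parsing are assumptions for clarity, since the configured fofo_postprocess handles this in practice.

```python
# Illustration only: a hand-written judge reply matching the schema requested
# above, parsed the way the prompt describes. Real handling: fofo_postprocess.
import json

sample_judge_reply = """
[
    {
        "model": "model",
        "format_correctness": 1,
        "reasons": "- follows the requested json block format; - no extra prose"
    }
]
"""

records = json.loads(sample_judge_reply)
assert records[0]["format_correctness"] in (0, 1)
```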

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -80,7 +81,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
@ -95,4 +95,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))

View File

@ -0,0 +1,98 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts'
]
base_prompt = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
fofo_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset, followbench_postprocess
from opencompass.datasets import FollowBenchDataset
from opencompass.summarizers import FollowBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['instruction', 'judge_prompt',],
@ -43,7 +44,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=followbench_postprocess),
),
pred_role='BOT',
)
@ -59,4 +59,5 @@ for _name in subjective_all_sets:
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
summarizer = dict(type=FollowBenchSummarizer,)
))

View File

@ -0,0 +1,62 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset, followbench_postprocess
subjective_reader_cfg = dict(
input_columns=['instruction', 'judge_prompt',],
output_column='judge',
)
subjective_all_sets = [
'followbench_llmeval_cn', 'followbench_llmeval_en',
]
data_path ='data/subjective/followbench/converted_data'
followbench_llmeval_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{instruction}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{judge_prompt}'
),
]),
),
dict_postprocessor=dict(type=followbench_postprocess),
),
pred_role='BOT',
)
followbench_llmeval_datasets.append(
dict(
abbr=f'{_name}',
type=FollowBenchDataset,
path=data_path,
name=_name,
mode='singlescore',
cate='llm',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
))

View File

@ -48,4 +48,4 @@ pip install -e .
python run.py configs/eval_hellobench.py
```
6. After that, you could find the results in outputs/hellobench/xxx/summary
6. After that, you could find the results in outputs/hellobench/xxx/summary

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBench101Dataset, mtbench101_postprocess
from opencompass.datasets import MTBench101Dataset
from opencompass.summarizers import MTBench101Summarizer
subjective_reader_cfg = dict(
input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
@ -45,7 +46,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=mtbench101_postprocess),
),
pred_role='BOT',
)
@ -60,4 +60,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBench101Summarizer, judge_type='single')
))

View File

@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBench101Dataset, mtbench101_postprocess
subjective_reader_cfg = dict(
input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
output_column='judge',
)
subjective_all_sets = [
'mtbench101',
]
data_path ='data/subjective/'
mtbench101_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = '{prompt_template}'
),
]),
),
dict_postprocessor=dict(type=mtbench101_postprocess),
),
pred_role='BOT',
)
mtbench101_datasets.append(
dict(
abbr=f'{_name}',
type=MTBench101Dataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset, mtbench_postprocess
from opencompass.datasets import MTBenchDataset
from opencompass.summarizers import MTBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
@ -47,7 +48,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=mtbench_postprocess),
),
pred_role='BOT',
)
@ -62,4 +62,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBenchSummarizer, judge_type='single')
))

View File

@ -0,0 +1,65 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset, mtbench_postprocess
subjective_reader_cfg = dict(
input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
output_column='judge',
)
subjective_all_sets = [
'mtbench_0.0','mtbench_0.1','mtbench_0.7'
]
data_path ='data/subjective/mtbench'
mtbench_datasets = []
for _name in subjective_all_sets:
temperature = float(_name.split('_')[1])
do_sample = False if temperature == 0.0 else True
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=1024, temperature=temperature, do_sample=do_sample,infer_mode='every'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = '{prompt_template}'
),
]),
),
dict_postprocessor=dict(type=mtbench_postprocess),
),
pred_role='BOT',
)
mtbench_datasets.append(
dict(
abbr=f'{_name}',
type=MTBenchDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))
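This config runs ChatInferencer with infer_mode='every' (one reply per user turn), whereas the mtbench101 config earlier in this diff uses infer_mode='last' (reply only to the final turn); the branch that routes between the two is visible in the ChatInferencer hunk at the end of this diff. Below is a conceptual sketch of that dispatch, an assumption for illustration rather than the actual implementation.

```python
# Conceptual sketch of the infer_mode dispatch (assumption; see the
# ChatInferencer hunk at the end of this diff for the real branch).
from typing import Callable, List

def run_dialogue(chat: List[dict], infer_mode: str,
                 infer_last: Callable[[List[dict]], None],
                 infer_every: Callable[[List[dict]], None]) -> None:
    """Route a multi-turn dialogue to the chosen inference strategy."""
    if infer_mode == 'last':
        # answer only the final user turn (mtbench101-style)
        infer_last(chat)
    elif infer_mode == 'every':
        # answer every user turn in sequence (as configured above)
        infer_every(chat)
    else:
        raise ValueError(f'unknown infer_mode: {infer_mode}')
```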

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WildBenchDataset, wildbench_postprocess
from opencompass.datasets import WildBenchDataset
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
@ -30,7 +30,6 @@ subjective_eval_cfg = dict(
type=PromptTemplate,
template="""{prompt}"""
),
dict_postprocessor=dict(type=wildbench_postprocess),
),
pred_role='BOT',
)
@ -63,4 +62,5 @@ wildbench_datasets.append(
mode='m2n', # m models are pitted against n models
infer_order='random',
base_models = [llama_2_70b, gpt4, claude],
summarizer = dict(type=WildBenchPairSummarizer),
))

View File

@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WildBenchDataset, wildbench_postprocess
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'prompt'],
output_column='judge',
)
data_path ='./data/subjective/WildBench/wildbench.jsonl'
wildbench_datasets = []
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}"""
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template="""{prompt}"""
),
dict_postprocessor=dict(type=wildbench_postprocess),
),
pred_role='BOT',
)
gpt4 = dict(
abbr='gpt4-turbo',
)
claude = dict(
abbr='HaiKu',
)
llama_2_70b = dict(
abbr='llama-2-70b-chat-hf',
)
wildbench_datasets.append(
dict(
abbr='wildbench',
type=WildBenchDataset,
path=data_path,
eval_mode='pair',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/subjective/WildBench/gpt4'},
{'abbr': 'llama-2-70b-chat-hf', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'HaiKu', 'path':'./data/subjective/WildBench/claude'},
{'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
mode='m2n', # m models are pitted against n models
infer_order='random',
base_models = [llama_2_70b, gpt4, claude],
))

View File

@ -245,7 +245,6 @@ class ChatInferencer(BaseInferencer):
logger.info('Starting inference process...')
for datum in tqdm(dataloader, disable=not self.is_main_process):
chat = datum[0]
if self.infer_mode == 'last':
self.infer_last(chat, index, output_handler)
elif self.infer_mode == 'every':