[Feature] support compassbench Checklist evaluation (#1339)

* fix pip version * fix pip version * support checklist eval * init * add lan * fix typo
2025-05-30 16:03:24 +08:00 · 2024-07-19 16:40:44 +08:00 · 2024-07-19 16:40:44 +08:00 · 1f9f728f22
commit 1f9f728f22
parent f40add2596
3 changed files with 263 additions and 0 deletions
--- a/configs/datasets/subjective/compassbench/compassbench_checklist.py
+++ b/configs/datasets/subjective/compassbench/compassbench_checklist.py
@@ -0,0 +1,224 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import CompassBenchCheklistDataset
+from mmengine.config import read_base
+
+# Reader configuration shared by every dataset built below: `question` and
+# `checklist` are the input columns, `judge` is the output column.
+subjective_reader_cfg = dict(
+    input_columns=['question','checklist'],
+    output_column='judge',
+    )
+
+# Dataset names grouped by language code. Each name is passed as `name` to the
+# dataset entry, and the language key selects the judge prompt in the loop
+# further down this file.
+subjective_all_sets = {'en':['fofo_test_prompts_checklist'],
+                       'cn':['fofo_test_prompts_cn_checklist']}
+
+# English pairwise-judge prompt template, used as the HUMAN prompt of the
+# evaluator for the 'en' subsets. It contains the placeholders {question},
+# {prediction}, {prediction2} and {checklist}; presumably they are filled in
+# by the evaluator at judge time -- confirm against LMEvaluator.
+# A trailing backslash inside the literal joins that line with the next one,
+# so those line breaks are not part of the resulting string.
+pair_prompt_en = """# Instruction
+
+You are an expert evaluator. Your task is to evaluate the quality of the \
+responses generated by two AI models.
+We will provide you with the user query and a pair of AI-generated \
+responses (Response A and Response B).
+You should first read the user query and the conversation history \
+carefully for analyzing the task, and then evaluate the quality of the \
+responses based on and rules provided below.
+
+# Conversation between User and AI
+
+## User Query
+<|begin_of_query|>
+
+{question}
+
+<|end_of_query|>
+
+## Response A
+<|begin_of_response_A|>
+
+{prediction}
+
+<|end_of_response_A|>
+
+## Response B
+<|begin_of_response_B|>
+
+{prediction2}
+
+<|end_of_response_B|>
+
+# Evaluation
+
+## Checklist
+
+<|begin_of_checklist|>
+
+{checklist}
+
+<|end_of_checklist|>
+
+Please use this checklist to guide your evaluation, but do not limit your \
+assessment to the checklist.
+
+## Rules
+
+You should compare the above two responses based on your analysis of the \
+user queries and the conversation history.
+You should first write down your analysis and the checklist that you used \
+for the evaluation, and then provide your assessment according to the \
+checklist.
+There are five choices to give your final assessment: ["A++", "A+", \
+"A=B", "B+", "B++"], which correspond to the following meanings:
+
+- `A++`: Response A is much better than Response B.
+- `A+`: Response A is only slightly better than Response B.
+- `A=B`: Response A and B are of the same quality. Please use this \
+choice sparingly.
+- `B+`: Response B is only slightly better than Response A.
+- `B++`: Response B is much better than Response A.
+
+## Output Format
+First, please output your analysis for each model response, and \
+then summarize your assessment to three aspects: "reason A=B", \
+"reason A>B", and "reason B>A", and finally make your choice for \
+the final assessment.
+
+Please provide your evaluation results in the following json \
+format by filling in the placeholders in []:
+```
+{
+    "analysis of A": "[analysis of Response A]",
+    "analysis of B": "[analysis of Response B]",
+    "reason of A=B": "[where Response A and B perform equally well]",
+    "reason of A>B": "[where Response A is better than Response B]",
+    "reason of B>A": "[where Response B is better than Response A]",
+    "choice": "[A++ or A+ or A=B or B+ or B++]",
+}
+```
+"""
+
+
+# Chinese pairwise-judge prompt template, used as the HUMAN prompt of the
+# evaluator for the 'cn' subsets. It carries the same placeholders as the
+# English template: {question}, {prediction}, {prediction2} and {checklist}.
+# The final "choice" key and its option labels are kept in English/ASCII in
+# both templates.
+pair_prompt_cn = """# 指令
+
+您是一位专业评估专家。您的任务是评估两个AI模型生成回答的质量。
+我们将为您提供用户问题及一对AI生成的回答（回答A和回答B）。
+您应当首先仔细阅读用户问题，然后根据以下提供的规则评估回答的质量。
+
+# 用户与AI之间的对话
+
+## 用户问题
+<|begin_of_query|>
+
+{question}
+
+<|end_of_query|>
+
+## 回答A
+<|begin_of_response_A|>
+
+{prediction}
+
+<|end_of_response_A|>
+
+## 回答B
+<|begin_of_response_B|>
+
+{prediction2}
+
+<|end_of_response_B|>
+
+# 评估
+
+## 检查清单
+
+<|begin_of_checklist|>
+
+{checklist}
+
+<|end_of_checklist|>
+
+请参考此检查清单来评估回答的质量，但不要局限于此检查清单。
+
+## 规则
+
+您应当基于用户查询，分析比较上述两种回答。
+您应当基于检查清单写下您的分析，然后提供您的评价。
+有五个选项供您做出最终评估：["A++", "A+", "A=B", "B+", "B++"]，它们对应如下含义：
+
+- `A++`：回答A远胜于回答B。
+- `A+`：回答A略优于回答B。
+- `A=B`：回答A和回答B质量相同。请谨慎使用此选项。
+- `B+`：回答B略优于回答A。
+- `B++`：回答B远胜于回答A。
+
+## 输出格式
+首先，请输出您对每个模型回答的分析，
+然后总结您的评估到三个方面："A=B的理由"，"A优于B的理由"，和 "B优于A的理由"，
+最后做出您对最终评估的选择。
+
+请按照以下json格式提供您的评估结果，通过填充[]中的占位符：
+```
+{
+    "回答A的分析": "[回答A的分析]",
+    "回答B的分析": "[回答B的分析]",
+    "A=B的理由": "[A和B回答差不多的理由]",
+    "A优于B的理由": "[回答A优于B的理由]",
+    "B优于A的理由": "[回答B优于A的理由]",
+    "choice": "[A++ or A+ or A=B or B+ or B++]",
+}
+```
+"""
+
+# Dataset entries exported by this config; one entry is appended per subset
+# listed in `subjective_all_sets`.
+checklist_datasets = []
+# Model entry handed to every dataset as `base_models`.
+# NOTE(review): the variable is called `gpt4` while the abbr is 'gpt4o'.
+gpt4 = [dict(
+    abbr='gpt4o',
+)]
+for lan, data_name_list in subjective_all_sets.items():
+    # Pick the judge prompt written in the same language as the subset.
+    if lan == 'en':
+        pair_prompt = pair_prompt_en
+    elif lan == 'cn':
+        pair_prompt = pair_prompt_cn
+    else:
+        # Without this branch an unknown language key would either raise
+        # NameError (first iteration) or silently reuse the prompt selected
+        # for the previous language.
+        raise ValueError(
+            f'No judge prompt defined for language {lan!r}; '
+            "expected 'en' or 'cn'.")
+    for _name in data_name_list:
+        # Inference stage: the raw question is sent as a single HUMAN turn.
+        # A fresh dict is built per dataset so entries never share state.
+        subjective_infer_cfg = dict(
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt='{question}',
+                    ),
+                ]),
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer, max_out_len=4096),
+        )
+
+        # Judge stage: the language-specific pairwise prompt is sent as a
+        # single HUMAN turn to the LMEvaluator.
+        subjective_eval_cfg = dict(
+            evaluator=dict(
+                type=LMEvaluator,
+                prompt_template=dict(
+                    type=PromptTemplate,
+                    template=dict(round=[
+                        dict(
+                            role='HUMAN',
+                            prompt=pair_prompt,
+                        ),
+                    ]),
+                ),
+            ),
+            pred_role='BOT',
+        )
+
+        checklist_datasets.append(
+            dict(
+                abbr=_name,
+                type=CompassBenchCheklistDataset,
+                path='./data/subjective/compassbench_checklist',
+                name=_name,
+                reader_cfg=subjective_reader_cfg,
+                infer_cfg=subjective_infer_cfg,
+                eval_cfg=subjective_eval_cfg,
+                # NOTE(review): 'm2n' with `base_models` presumably means each
+                # evaluated model is compared against the base models, with
+                # the A/B order randomised -- confirm against the subjective
+                # partitioner.
+                mode='m2n',
+                infer_order='random',
+                base_models=gpt4,
+            ))
--- a/opencompass/datasets/subjective/__init__.py
+++ b/opencompass/datasets/subjective/__init__.py
@@ -2,6 +2,8 @@ from .alignbench import AlignmentBenchDataset  # noqa: F401, F403
 from .arena_hard import ArenaHardDataset  # noqa: F401, F403
 from .compass_arena import CompassArenaDataset  # noqa: F401, F403
 from .compassbench import CompassBenchDataset  # noqa: F401, F403
+from .compassbench_checklist import \
+    CompassBenchCheklistDataset  # noqa: F401, F403
 from .compassbench_control_length_bias import \
    CompassBenchControlLengthBiasDataset  # noqa: F401, F403
 from .corev2 import Corev2Dataset  # noqa: F401, F403
--- a/opencompass/datasets/subjective/compassbench_checklist.py
+++ b/opencompass/datasets/subjective/compassbench_checklist.py
@@ -0,0 +1,37 @@
+# flake8: noqa
+import json
+import os.path as osp
+
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from ..base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class CompassBenchCheklistDataset(BaseDataset):
+    """Dataset loader for CompassBench checklist JSON files."""
+
+    def load(self, path: str, name: str, *args, **kwargs):
+        """Read ``<path>/<name>.json`` and convert it into a ``Dataset``.
+
+        The JSON file is iterated as a sequence of problem dicts. Each one
+        must have an ``instruction`` key; ``checklist``, ``category``,
+        ``lan`` and ``id`` are optional.
+
+        Args:
+            path: Directory containing the JSON file.
+            name: File name without the ``.json`` suffix.
+
+        Returns:
+            A ``Dataset`` with the columns ``question``, ``checklist`` (one
+            ``- item`` line per checklist entry, or an empty string) and
+            ``judge`` (a dict with ``category``, ``lan``, ``id`` and
+            ``question``).
+        """
+        json_path = osp.join(path, f'{name}.json')
+        with open(json_path, 'r', encoding='utf-8') as fp:
+            problems = json.load(fp)
+
+        rows = []
+        for item in problems:
+            instruction = item['instruction']
+            # A missing or empty checklist yields an empty string.
+            bullets = item.get('checklist') or []
+            checklist_md = ''.join(f'- {entry}\n' for entry in bullets)
+            rows.append(dict(
+                question=instruction,
+                checklist=checklist_md,
+                judge=dict(
+                    category=item.get('category'),
+                    lan=item.get('lan'),
+                    id=item.get('id'),
+                    question=instruction,
+                ),
+            ))
+        return Dataset.from_list(rows)