Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Feature] support compassbench Checklist evaluation (#1339)
* fix pip version
* fix pip version
* support checklist eval
* init
* add lan
* fix typo
This commit is contained in:
parent f40add2596
commit 1f9f728f22
@@ -0,0 +1,224 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassBenchCheklistDataset
from mmengine.config import read_base

subjective_reader_cfg = dict(
    input_columns=['question', 'checklist'],
    output_column='judge',
)

subjective_all_sets = {
    'en': ['fofo_test_prompts_checklist'],
    'cn': ['fofo_test_prompts_cn_checklist'],
}

pair_prompt_en = """# Instruction

You are an expert evaluator. Your task is to evaluate the quality of the \
responses generated by two AI models.
We will provide you with the user query and a pair of AI-generated \
responses (Response A and Response B).
You should first read the user query and the conversation history \
carefully for analyzing the task, and then evaluate the quality of the \
responses based on the rules provided below.

# Conversation between User and AI

## User Query
<|begin_of_query|>

{question}

<|end_of_query|>

## Response A
<|begin_of_response_A|>

{prediction}

<|end_of_response_A|>

## Response B
<|begin_of_response_B|>

{prediction2}

<|end_of_response_B|>

# Evaluation

## Checklist

<|begin_of_checklist|>

{checklist}

<|end_of_checklist|>

Please use this checklist to guide your evaluation, but do not limit your \
assessment to the checklist.

## Rules

You should compare the above two responses based on your analysis of the \
user query and the conversation history.
You should first write down your analysis and the checklist that you used \
for the evaluation, and then provide your assessment according to the \
checklist.
There are five choices to give your final assessment: ["A++", "A+", \
"A=B", "B+", "B++"], which correspond to the following meanings:

- `A++`: Response A is much better than Response B.
- `A+`: Response A is only slightly better than Response B.
- `A=B`: Response A and B are of the same quality. Please use this \
choice sparingly.
- `B+`: Response B is only slightly better than Response A.
- `B++`: Response B is much better than Response A.

## Output Format
First, please output your analysis for each model response, and \
then summarize your assessment into three aspects: "reason A=B", \
"reason A>B", and "reason B>A", and finally make your choice for \
the final assessment.

Please provide your evaluation results in the following json \
format by filling in the placeholders in []:
```
{
    "analysis of A": "[analysis of Response A]",
    "analysis of B": "[analysis of Response B]",
    "reason of A=B": "[where Response A and B perform equally well]",
    "reason of A>B": "[where Response A is better than Response B]",
    "reason of B>A": "[where Response B is better than Response A]",
    "choice": "[A++ or A+ or A=B or B+ or B++]"
}
```
"""

pair_prompt_cn = """# 指令

您是一位专业评估专家。您的任务是评估两个AI模型生成回答的质量。
我们将为您提供用户问题及一对AI生成的回答(回答A和回答B)。
您应当首先仔细阅读用户问题,然后根据以下提供的规则评估回答的质量。

# 用户与AI之间的对话

## 用户问题
<|begin_of_query|>

{question}

<|end_of_query|>

## 回答A
<|begin_of_response_A|>

{prediction}

<|end_of_response_A|>

## 回答B
<|begin_of_response_B|>

{prediction2}

<|end_of_response_B|>

# 评估

## 检查清单

<|begin_of_checklist|>

{checklist}

<|end_of_checklist|>

请参考此检查清单来评估回答的质量,但不要局限于此检查清单。

## 规则

您应当基于用户查询,分析比较上述两种回答。
您应当基于检查清单写下您的分析,然后提供您的评价。
有五个选项供您做出最终评估:["A++", "A+", "A=B", "B+", "B++"],它们对应如下含义:

- `A++`:回答A远胜于回答B。
- `A+`:回答A略优于回答B。
- `A=B`:回答A和回答B质量相同。请谨慎使用此选项。
- `B+`:回答B略优于回答A。
- `B++`:回答B远胜于回答A。

## 输出格式
首先,请输出您对每个模型回答的分析,
然后总结您的评估到三个方面:"A=B的理由","A优于B的理由",和"B优于A的理由",
最后做出您对最终评估的选择。

请按照以下json格式提供您的评估结果,通过填充[]中的占位符:
```
{
    "回答A的分析": "[回答A的分析]",
    "回答B的分析": "[回答B的分析]",
    "A=B的理由": "[A和B回答差不多的理由]",
    "A优于B的理由": "[回答A优于B的理由]",
    "B优于A的理由": "[回答B优于A的理由]",
    "choice": "[A++ or A+ or A=B or B+ or B++]"
}
```
"""

checklist_datasets = []

gpt4 = [dict(
    abbr='gpt4o',
)]

for lan, data_name_list in subjective_all_sets.items():
    if lan == 'en':
        pair_prompt = pair_prompt_en
    elif lan == 'cn':
        pair_prompt = pair_prompt_cn
    for _name in data_name_list:
        subjective_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt='{question}',
                    ),
                ]),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=4096),
        )

        subjective_eval_cfg = dict(
            evaluator=dict(
                type=LMEvaluator,
                prompt_template=dict(
                    type=PromptTemplate,
                    template=dict(round=[
                        dict(
                            role='HUMAN',
                            prompt=pair_prompt,
                        ),
                    ]),
                ),
            ),
            pred_role='BOT',
        )

        checklist_datasets.append(
            dict(
                abbr=f'{_name}',
                type=CompassBenchCheklistDataset,
                path='./data/subjective/compassbench_checklist',
                name=_name,
                reader_cfg=subjective_reader_cfg,
                infer_cfg=subjective_infer_cfg,
                eval_cfg=subjective_eval_cfg,
                mode='m2n',
                infer_order='random',
                base_models=gpt4,
            ))
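The judge template above asks the evaluator model to reply with a JSON object whose `choice` field is one of `A++`, `A+`, `A=B`, `B+`, or `B++`. As a rough illustration of how such replies could be consumed downstream, here is a minimal, self-contained sketch for extracting and tallying the verdicts; it is not part of this commit, and the helper names (`parse_choice`, `tally`) are hypothetical rather than OpenCompass APIs.

import json
import re
from collections import Counter

CHOICES = {'A++', 'A+', 'A=B', 'B+', 'B++'}


def parse_choice(judge_reply: str):
    """Pull the `choice` field out of a judge reply that embeds a JSON object."""
    match = re.search(r'\{.*\}', judge_reply, re.DOTALL)
    if match:
        try:
            obj = json.loads(match.group(0))
        except json.JSONDecodeError:
            obj = None
        if isinstance(obj, dict):
            choice = str(obj.get('choice', '')).strip()
            if choice in CHOICES:
                return choice
    # Fallback for replies that are not valid JSON (extra text, trailing commas, ...).
    match = re.search(r'"choice"\s*:\s*"([^"]+)"', judge_reply)
    if match and match.group(1).strip() in CHOICES:
        return match.group(1).strip()
    return None


def tally(judge_replies):
    """Count verdicts; A is the base-model response, B the model under test."""
    counts = Counter(parse_choice(reply) for reply in judge_replies)
    counts.pop(None, None)  # drop unparseable replies
    return counts


if __name__ == '__main__':
    replies = [
        '{"analysis of A": "...", "analysis of B": "...", "choice": "B+"}',
        'Preamble text {"choice": "A=B",} trailing text',
    ]
    print(tally(replies))  # Counter({'B+': 1, 'A=B': 1})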
opencompass/datasets/subjective/__init__.py
@@ -2,6 +2,8 @@ from .alignbench import AlignmentBenchDataset  # noqa: F401, F403
 from .arena_hard import ArenaHardDataset  # noqa: F401, F403
 from .compass_arena import CompassArenaDataset  # noqa: F401, F403
 from .compassbench import CompassBenchDataset  # noqa: F401, F403
+from .compassbench_checklist import \
+    CompassBenchCheklistDataset  # noqa: F401, F403
 from .compassbench_control_length_bias import \
     CompassBenchControlLengthBiasDataset  # noqa: F401, F403
 from .corev2 import Corev2Dataset  # noqa: F401, F403
opencompass/datasets/subjective/compassbench_checklist.py (new file, 37 lines)
@@ -0,0 +1,37 @@
# flake8: noqa
import json
import os.path as osp

from datasets import Dataset

from opencompass.registry import LOAD_DATASET

from ..base import BaseDataset


@LOAD_DATASET.register_module()
class CompassBenchCheklistDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        filename = osp.join(path, f'{name}.json')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
            for problem in json_data:
                question = problem['instruction']
                # Render the checklist items as a Markdown bullet list for the judge prompt.
                checklist_markdown = ''
                if problem.get('checklist', None):
                    for checklist_item in problem['checklist']:
                        checklist_markdown += f'- {checklist_item}\n'
                raw_data.append({
                    'question': question,
                    'checklist': checklist_markdown,
                    'judge': {
                        'category': problem.get('category', None),
                        'lan': problem.get('lan', None),
                        'id': problem.get('id', None),
                        'question': question,
                    },
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
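For reference, the loader above reads `{name}.json` from the dataset path and only consumes the `instruction`, `checklist`, `category`, `lan`, and `id` fields of each record. Below is a hypothetical sketch of a data file in that shape; the question text, checklist items, and id are invented for illustration, and only the field names plus the path and filename (taken from the config above) reflect the loader's actual expectations.

import json
import os

# Illustrative records only: the real fofo_test_prompts_checklist.json is not part of this commit.
example_records = [
    {
        'instruction': 'Return a JSON object describing a book with "title" and "author" fields.',
        'checklist': [
            'The response is valid JSON.',
            'The object contains exactly the two requested fields.',
        ],
        'category': 'format_following',  # surfaced to the judge via the `judge` output column
        'lan': 'en',
        'id': 'example-0001',
    },
]

out_dir = './data/subjective/compassbench_checklist'
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, 'fofo_test_prompts_checklist.json'), 'w', encoding='utf-8') as f:
    json.dump(example_records, f, ensure_ascii=False, indent=2)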