diff --git a/configs/datasets/subjective/compassbench/compassbench_checklist.py b/configs/datasets/subjective/compassbench/compassbench_checklist.py
new file mode 100644
index 00000000..7cba0a91
--- /dev/null
+++ b/configs/datasets/subjective/compassbench/compassbench_checklist.py
@@ -0,0 +1,224 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import CompassBenchCheklistDataset
+from mmengine.config import read_base
+
+subjective_reader_cfg = dict(
+    input_columns=['question', 'checklist'],
+    output_column='judge',
+    )
+
+subjective_all_sets = {'en': ['fofo_test_prompts_checklist'],
+                       'cn': ['fofo_test_prompts_cn_checklist']}
+
+pair_prompt_en = """# Instruction
+
+You are an expert evaluator. Your task is to evaluate the quality of the \
+responses generated by two AI models.
+We will provide you with the user query and a pair of AI-generated \
+responses (Response A and Response B).
+You should first read the user query and the conversation history \
+carefully to analyze the task, and then evaluate the quality of the \
+responses based on the rules provided below.
+
+# Conversation between User and AI
+
+## User Query
+<|begin_of_query|>
+
+{question}
+
+<|end_of_query|>
+
+## Response A
+<|begin_of_response_A|>
+
+{prediction}
+
+<|end_of_response_A|>
+
+## Response B
+<|begin_of_response_B|>
+
+{prediction2}
+
+<|end_of_response_B|>
+
+# Evaluation
+
+## Checklist
+
+<|begin_of_checklist|>
+
+{checklist}
+
+<|end_of_checklist|>
+
+Please use this checklist to guide your evaluation, but do not limit your \
+assessment to the checklist.
+
+## Rules
+
+You should compare the above two responses based on your analysis of the \
+user query and the conversation history.
+You should first write down your analysis and the checklist that you used \
+for the evaluation, and then provide your assessment according to the \
+checklist.
+There are five choices to give your final assessment: ["A++", "A+", \
+"A=B", "B+", "B++"], which correspond to the following meanings:
+
+- `A++`: Response A is much better than Response B.
+- `A+`: Response A is only slightly better than Response B.
+- `A=B`: Response A and B are of the same quality. Please use this \
+choice sparingly.
+- `B+`: Response B is only slightly better than Response A.
+- `B++`: Response B is much better than Response A.
+
+## Output Format
+First, please output your analysis for each model response, and \
+then summarize your assessment in three aspects: "reason A=B", \
+"reason A>B", and "reason B>A", and finally make your choice for \
+the final assessment.
+
+Please provide your evaluation results in the following json \
+format by filling in the placeholders in []:
+```
+{
+    "analysis of A": "[analysis of Response A]",
+    "analysis of B": "[analysis of Response B]",
+    "reason of A=B": "[where Response A and B perform equally well]",
+    "reason of A>B": "[where Response A is better than Response B]",
+    "reason of B>A": "[where Response B is better than Response A]",
+    "choice": "[A++ or A+ or A=B or B+ or B++]",
+}
+```
+"""
+
+
+pair_prompt_cn = """# 指令
+
+您是一位专业评估专家。您的任务是评估两个AI模型生成回答的质量。
+我们将为您提供用户问题及一对AI生成的回答(回答A和回答B)。
+您应当首先仔细阅读用户问题,然后根据以下提供的规则评估回答的质量。
+
+# 用户与AI之间的对话
+
+## 用户问题
+<|begin_of_query|>
+
+{question}
+
+<|end_of_query|>
+
+## 回答A
+<|begin_of_response_A|>
+
+{prediction}
+
+<|end_of_response_A|>
+
+## 回答B
+<|begin_of_response_B|>
+
+{prediction2}
+
+<|end_of_response_B|>
+
+# 评估
+
+## 检查清单
+
+<|begin_of_checklist|>
+
+{checklist}
+
+<|end_of_checklist|>
+
+请参考此检查清单来评估回答的质量,但不要局限于此检查清单。
+
+## 规则
+
+您应当基于用户查询,分析比较上述两种回答。
+您应当基于检查清单写下您的分析,然后提供您的评价。
+有五个选项供您做出最终评估:["A++", "A+", "A=B", "B+", "B++"],它们对应如下含义:
+
+- `A++`:回答A远胜于回答B。
+- `A+`:回答A略优于回答B。
+- `A=B`:回答A和回答B质量相同。请谨慎使用此选项。
+- `B+`:回答B略优于回答A。
+- `B++`:回答B远胜于回答A。
+
+## 输出格式
+首先,请输出您对每个模型回答的分析,
+然后总结您的评估到三个方面:"A=B的理由"、"A优于B的理由"和"B优于A的理由",
+最后做出您对最终评估的选择。
+
+请按照以下json格式提供您的评估结果,通过填充[]中的占位符:
+```
+{
+    "回答A的分析": "[回答A的分析]",
+    "回答B的分析": "[回答B的分析]",
+    "A=B的理由": "[A和B回答差不多的理由]",
+    "A优于B的理由": "[回答A优于B的理由]",
+    "B优于A的理由": "[回答B优于A的理由]",
+    "choice": "[A++ or A+ or A=B or B+ or B++]",
+}
+```
+"""
+
+
+checklist_datasets = []
+gpt4 = [dict(
+    abbr='gpt4o',
+)]
+for lan, data_name_list in subjective_all_sets.items():
+    if lan == 'en':
+        pair_prompt = pair_prompt_en
+    elif lan == 'cn':
+        pair_prompt = pair_prompt_cn
+    for _name in data_name_list:
+        subjective_infer_cfg = dict(
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt='{question}'
+                    ),
+                ]),
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer, max_out_len=4096),
+        )
+
+        subjective_eval_cfg = dict(
+            evaluator=dict(
+                type=LMEvaluator,
+                prompt_template=dict(
+                    type=PromptTemplate,
+                    template=dict(
+                        round=[
+                            dict(
+                                role='HUMAN',
+                                prompt=pair_prompt
+                            ),
+                        ]),
+                ),
+            ),
+            pred_role='BOT',
+        )
+
+        checklist_datasets.append(
+            dict(
+                abbr=f'{_name}',
+                type=CompassBenchCheklistDataset,
+                path='./data/subjective/compassbench_checklist',
+                name=_name,
+                reader_cfg=subjective_reader_cfg,
+                infer_cfg=subjective_infer_cfg,
+                eval_cfg=subjective_eval_cfg,
+                mode='m2n',
+                infer_order='random',
+                base_models=gpt4,
+            ))
diff --git a/opencompass/datasets/subjective/__init__.py b/opencompass/datasets/subjective/__init__.py
index 4f952125..077db379 100644
--- a/opencompass/datasets/subjective/__init__.py
+++ b/opencompass/datasets/subjective/__init__.py
@@ -2,6 +2,8 @@ from .alignbench import AlignmentBenchDataset  # noqa: F401, F403
 from .arena_hard import ArenaHardDataset  # noqa: F401, F403
 from .compass_arena import CompassArenaDataset  # noqa: F401, F403
 from .compassbench import CompassBenchDataset  # noqa: F401, F403
+from .compassbench_checklist import \
+    CompassBenchCheklistDataset  # noqa: F401, F403
 from .compassbench_control_length_bias import \
     CompassBenchControlLengthBiasDataset  # noqa: F401, F403
 from .corev2 import Corev2Dataset  # noqa: F401, F403
diff --git a/opencompass/datasets/subjective/compassbench_checklist.py b/opencompass/datasets/subjective/compassbench_checklist.py
new file mode 100644
index 00000000..48c5f738
--- /dev/null
+++ b/opencompass/datasets/subjective/compassbench_checklist.py
@@ -0,0 +1,37 @@
+# flake8: noqa
+import json
+import os.path as osp
+
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from ..base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class CompassBenchCheklistDataset(BaseDataset):
+
+    def load(self, path: str, name: str, *args, **kwargs):
+        filename = osp.join(path, f'{name}.json')
+        raw_data = []
+        with open(filename, 'r', encoding='utf-8') as f:
+            json_data = json.load(f)
+            for problem in json_data:
+                question = problem['instruction']
+                checklist_markdown = ''
+                if problem.get('checklist', None):
+                    for checklist_item in problem['checklist']:
+                        checklist_markdown += f'- {checklist_item}\n'
+                raw_data.append({
+                    'question': question,
+                    'checklist': checklist_markdown,
+                    'judge': {
+                        'category': problem.get('category', None),
+                        'lan': problem.get('lan', None),
+                        'id': problem.get('id', None),
+                        'question': question
+                    }
+                })
+        dataset = Dataset.from_list(raw_data)
+        return dataset
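
Usage note: this diff only adds the dataset config and loader, not an eval entry point. The snippet below is a minimal sketch of how an entry config might consume `checklist_datasets`, following the pattern of OpenCompass's existing subjective-eval configs. The model list, the judge entry (`gpt-4o` via the `OpenAI` wrapper), and the partitioner/runner wiring are illustrative assumptions, not part of this PR, and exact partitioner arguments may differ between OpenCompass versions.

```python
# Hypothetical entry config, e.g. configs/eval_compassbench_checklist.py
from mmengine.config import read_base

from opencompass.models import OpenAI
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

with read_base():
    # Pull in the dataset list defined by this PR.
    from .datasets.subjective.compassbench.compassbench_checklist import \
        checklist_datasets

datasets = [*checklist_datasets]

# Models whose responses will be judged against the gpt4o base model (fill in).
models = []

# Judge model used by LMEvaluator; path/key values are placeholders.
judge_models = [
    dict(
        abbr='GPT-4o-judge',
        type=OpenAI,
        path='gpt-4o',
        key='YOUR_API_KEY',
        max_out_len=2048,
        batch_size=8,
    )
]

eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=SubjectiveEvalTask),
    ),
)
```

For reference, `CompassBenchCheklistDataset.load()` reads `{name}.json` from `./data/subjective/compassbench_checklist` and only touches a handful of fields, so each data file is expected to look roughly like the structure below. Field values are invented for illustration; `category`, `lan`, and `id` are optional and are passed through to the judge metadata.

```python
# Illustrative shape of one record in fofo_test_prompts_checklist.json.
example_records = [
    {
        'id': 0,
        'lan': 'en',
        'category': 'format',
        'instruction': 'Write the weekly status report as a Markdown table.',
        'checklist': [
            'Is the response formatted as a Markdown table?',
            'Does it include every requested column?',
        ],
    },
]
```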