[Feature] support compassbench Checklist evaluation (#1339)

* fix pip version

* fix pip version

* support checklist eval

* init

* add lan

* fix typo
bittersweet1999 2024-07-19 16:40:44 +08:00 committed by GitHub
parent f40add2596
commit 1f9f728f22
3 changed files with 263 additions and 0 deletions
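
The new config builds up a `checklist_datasets` list, so a top-level evaluation config can pull it in through `read_base()`. A minimal sketch of that wiring (the relative import path and the surrounding eval setup are assumptions, not part of this commit):

```
# Sketch only: consume the checklist dataset configs from a top-level eval
# config. The import path below is a guess at where the new file lives.
from mmengine.config import read_base

with read_base():
    from .datasets.subjective.compassbench.compassbench_checklist import \
        checklist_datasets  # hypothetical relative path

# One dataset config per split: fofo_test_prompts_checklist (en) and
# fofo_test_prompts_cn_checklist (cn).
datasets = [*checklist_datasets]

# A judge model for LMEvaluator, the candidate models, and a subjective
# partitioner/summarizer still have to be configured separately.
```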


@@ -0,0 +1,224 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassBenchCheklistDataset
from mmengine.config import read_base
subjective_reader_cfg = dict(
    input_columns=['question', 'checklist'],
    output_column='judge',
)

subjective_all_sets = {
    'en': ['fofo_test_prompts_checklist'],
    'cn': ['fofo_test_prompts_cn_checklist'],
}
pair_prompt_en = """# Instruction
You are an expert evaluator. Your task is to evaluate the quality of the \
responses generated by two AI models.
We will provide you with the user query and a pair of AI-generated \
responses (Response A and Response B).
You should first read the user query and the conversation history \
carefully to analyze the task, and then evaluate the quality of the \
responses based on the rules provided below.
# Conversation between User and AI
## User Query
<|begin_of_query|>
{question}
<|end_of_query|>
## Response A
<|begin_of_response_A|>
{prediction}
<|end_of_response_A|>
## Response B
<|begin_of_response_B|>
{prediction2}
<|end_of_response_B|>
# Evaluation
## Checklist
<|begin_of_checklist|>
{checklist}
<|end_of_checklist|>
Please use this checklist to guide your evaluation, but do not limit your \
assessment to the checklist.
## Rules
You should compare the above two responses based on your analysis of the \
user queries and the conversation history.
You should first write down your analysis and the checklist that you used \
for the evaluation, and then provide your assessment according to the \
checklist.
There are five choices to give your final assessment: ["A++", "A+", \
"A=B", "B+", "B++"], which correspond to the following meanings:
- `A++`: Response A is much better than Response B.
- `A+`: Response A is only slightly better than Response B.
- `A=B`: Response A and B are of the same quality. Please use this \
choice sparingly.
- `B+`: Response B is only slightly better than Response A.
- `B++`: Response B is much better than Response A.
## Output Format
First, please output your analysis for each model response, and \
then summarize your assessment into three aspects: "reason A=B", \
"reason A>B", and "reason B>A", and finally make your choice for \
the final assessment.
Please provide your evaluation results in the following json \
format by filling in the placeholders in []:
```
{
"analysis of A": "[analysis of Response A]",
"analysis of B": "[analysis of Response B]",
"reason of A=B": "[where Response A and B perform equally well]",
"reason of A>B": "[where Response A is better than Response B]",
"reason of B>A": "[where Response B is better than Response A]",
"choice": "[A++ or A+ or A=B or B+ or B++]",
}
```
"""
pair_prompt_cn = """# 指令
您是一位专业评估专家，您的任务是评估两个AI模型生成回答的质量。
我们将为您提供用户问题及一对AI生成的回答（回答A和回答B）。
您应当首先仔细阅读用户问题，然后根据以下提供的规则评估回答的质量。
# 用户与AI之间的对话
## 用户问题
<|begin_of_query|>
{question}
<|end_of_query|>
## 回答A
<|begin_of_response_A|>
{prediction}
<|end_of_response_A|>
## 回答B
<|begin_of_response_B|>
{prediction2}
<|end_of_response_B|>
# 评估
## 检查清单
<|begin_of_checklist|>
{checklist}
<|end_of_checklist|>
请参考此检查清单来评估回答的质量，但不要局限于此检查清单。
## 规则
您应当基于对用户查询的分析来比较上述两种回答。
您应当基于检查清单写下您的分析，然后提供您的评价。
有五个选项供您做出最终评估：["A++", "A+", "A=B", "B+", "B++"]，它们对应如下含义：
- `A++`：回答A远胜于回答B。
- `A+`：回答A略优于回答B。
- `A=B`：回答A和回答B质量相同，请谨慎使用此选项。
- `B+`：回答B略优于回答A。
- `B++`：回答B远胜于回答A。
## 输出格式
首先，请输出您对每个模型回答的分析；
然后从"A=B的理由"、"A优于B的理由"和"B优于A的理由"三个方面总结您的评估；
最后做出您的最终评估选择。
请按照以下json格式提供您的评估结果，通过填充[]中的占位符：
```
{
"回答A的分析": "[回答A的分析]",
"回答B的分析": "[回答B的分析]",
"A=B的理由": "[A和B回答差不多的理由]",
"A优于B的理由": "[回答A优于B的理由]",
"B优于A的理由": "[回答B优于A的理由]",
"choice": "[A++ or A+ or A=B or B+ or B++]",
}
```
"""
checklist_datasets = []

gpt4 = [dict(
    abbr='gpt4o',
)]

for lan, data_name_list in subjective_all_sets.items():
    if lan == 'en':
        pair_prompt = pair_prompt_en
    elif lan == 'cn':
        pair_prompt = pair_prompt_cn
    for _name in data_name_list:
        subjective_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt='{question}'
                    ),
                ]),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=4096),
        )

        subjective_eval_cfg = dict(
            evaluator=dict(
                type=LMEvaluator,
                prompt_template=dict(
                    type=PromptTemplate,
                    template=dict(round=[
                        dict(
                            role='HUMAN',
                            prompt=pair_prompt
                        ),
                    ]),
                ),
            ),
            pred_role='BOT',
        )

        checklist_datasets.append(
            dict(
                abbr=f'{_name}',
                type=CompassBenchCheklistDataset,
                path='./data/subjective/compassbench_checklist',
                name=_name,
                reader_cfg=subjective_reader_cfg,
                infer_cfg=subjective_infer_cfg,
                eval_cfg=subjective_eval_cfg,
                mode='m2n',
                infer_order='random',
                base_models=gpt4,
            ))
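
Both prompts require the judge to answer in JSON with a final "choice" field, and this commit adds no post-processing for that reply. A tolerant extraction helper might look like the following sketch (hypothetical; `extract_choice` is not part of the diff):

```
# Hypothetical helper, not part of this commit: pull the final verdict out of
# a judge reply that follows the JSON format requested by the prompts above.
import re
from typing import Optional


def extract_choice(judge_reply: str) -> Optional[str]:
    """Return one of A++ / A+ / A=B / B+ / B++, or None if no verdict is found."""
    # The prompt asks for the JSON inside a code fence, but judge models often
    # add trailing commas or extra prose, so a tolerant regex on the "choice"
    # key is safer than json.loads on the raw reply.
    match = re.search(r'"choice"\s*:\s*"(A\+\+|A\+|A=B|B\+\+|B\+)"', judge_reply)
    return match.group(1) if match else None


print(extract_choice('{"choice": "B+"}'))  # -> B+
```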


@@ -2,6 +2,8 @@ from .alignbench import AlignmentBenchDataset # noqa: F401, F403
from .arena_hard import ArenaHardDataset # noqa: F401, F403
from .compass_arena import CompassArenaDataset # noqa: F401, F403
from .compassbench import CompassBenchDataset # noqa: F401, F403
from .compassbench_checklist import \
    CompassBenchCheklistDataset # noqa: F401, F403
from .compassbench_control_length_bias import \
    CompassBenchControlLengthBiasDataset # noqa: F401, F403
from .corev2 import Corev2Dataset # noqa: F401, F403


@@ -0,0 +1,37 @@
# flake8: noqa
import json
import os.path as osp

from datasets import Dataset

from opencompass.registry import LOAD_DATASET

from ..base import BaseDataset


@LOAD_DATASET.register_module()
class CompassBenchCheklistDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        filename = osp.join(path, f'{name}.json')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
            for problem in json_data:
                question = problem['instruction']
                # Render the checklist items as a markdown bullet list.
                checklist_markdown = ''
                if problem.get('checklist', None):
                    for checklist_item in problem['checklist']:
                        checklist_markdown += f'- {checklist_item}\n'
                raw_data.append({
                    'question': question,
                    'checklist': checklist_markdown,
                    'judge': {
                        'category': problem.get('category', None),
                        'lan': problem.get('lan', None),
                        'id': problem.get('id', None),
                        'question': question
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
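
For reference, `load()` above only reads the `instruction`, `checklist`, `category`, `lan`, and `id` fields, so the source JSON it expects looks roughly like this sketch (the concrete values and the category label are invented; the real data file is not included in this commit):

```
# Illustrative only: a record shape that load() above would accept. The values
# and the category label are invented; the real fofo_test_prompts_checklist
# data file is not part of this commit.
import json
import os

sample = [{
    'id': 0,
    'lan': 'en',
    'category': 'format_following',   # hypothetical label
    'instruction': 'List three prime numbers as a markdown table.',
    'checklist': [
        'Does the response use a markdown table?',
        'Are all listed numbers prime?',
    ],
}]

data_dir = './data/subjective/compassbench_checklist'
os.makedirs(data_dir, exist_ok=True)
with open(osp_path := os.path.join(data_dir, 'fofo_test_prompts_checklist.json'),
          'w', encoding='utf-8') as f:
    json.dump(sample, f, ensure_ascii=False, indent=2)

# load() maps each record to {'question', 'checklist' (markdown bullets),
# 'judge' (category / lan / id / question)}, matching the reader_cfg columns
# used in the dataset config.
```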