OpenCompass/configs/datasets/subjective/compassbench/compassbench_checklist.py
Linchen Xiao a56678190b
[Feature] CompassBench v1_3 subjective evaluation (#1341)
* stash files

* compassbench subjective evaluation added

* evaluation update

* remove unneeded content

* fix lint

* update docs

* Update lint

* Update

---------

Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn>
2024-07-19 23:12:23 +08:00


from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassBenchCheklistDataset
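
# Reader configuration: the 'question' and 'checklist' columns feed the
# prompts below; 'judge' is the output column consumed by the subjective
# (LM-judge) evaluation pipeline.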
subjective_reader_cfg = dict(
    input_columns=['question', 'checklist'],
    output_column='judge',
)
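
# Validation splits per language, grouped by capability
# (language / instruct / reasoning / coding).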
subjective_all_sets = {
    'en': [
        'language/compass_bench_language_en_val',
        'instruct/compass_bench_instruct_en_val',
        'reasoning/compass_bench_reasoning_en_val',
        'coding/compass_bench_coding_en_val',
    ],
    'cn': [
        'language/compass_bench_language_cn_val',
        'instruct/compass_bench_instruct_cn_val',
        'reasoning/compass_bench_reasoning_cn_val',
        'coding/compass_bench_coding_cn_val',
    ],
}
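
# Local root of the CompassBench v1.3 subjective data.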
data_path = './data/compassbench_v1_3/data'
pair_prompt_en = """# Instruction
You are an expert evaluator. Your task is to evaluate the quality of the \
responses generated by two AI models.
We will provide you with the user query and a pair of AI-generated \
responses (Response A and Response B).
You should first read the user query and the conversation history \
carefully to analyze the task, and then evaluate the quality of the \
responses based on the rules provided below.
# Conversation between User and AI
## User Query
<|begin_of_query|>
{question}
<|end_of_query|>
## Response A
<|begin_of_response_A|>
{prediction}
<|end_of_response_A|>
## Response B
<|begin_of_response_B|>
{prediction2}
<|end_of_response_B|>
# Evaluation
## Checklist
<|begin_of_checklist|>
{checklist}
<|end_of_checklist|>
Please use this checklist to guide your evaluation, but do not limit your \
assessment to the checklist.
## Rules
You should compare the above two responses based on your analysis of the \
user queries and the conversation history.
You should first write down your analysis and the checklist that you used \
for the evaluation, and then provide your assessment according to the \
checklist.
There are five choices to give your final assessment: ["A++", "A+", \
"A=B", "B+", "B++"], which correspond to the following meanings:
- `A++`: Response A is much better than Response B.
- `A+`: Response A is only slightly better than Response B.
- `A=B`: Response A and B are of the same quality. Please use this \
choice sparingly.
- `B+`: Response B is only slightly better than Response A.
- `B++`: Response B is much better than Response A.
## Output Format
First, please output your analysis for each model response, and \
then summarize your assessment into three aspects: "reason A=B", \
"reason A>B", and "reason B>A", and finally make your choice for \
the final assessment.
Please provide your evaluation results in the following json \
format by filling in the placeholders in []:
```
{
    "analysis of A": "[analysis of Response A]",
    "analysis of B": "[analysis of Response B]",
    "reason of A=B": "[where Response A and B perform equally well]",
    "reason of A>B": "[where Response A is better than Response B]",
    "reason of B>A": "[where Response B is better than Response A]",
    "choice": "[A++ or A+ or A=B or B+ or B++]",
}
```
"""
pair_prompt_cn = """# 指令
您是一位专业评估专家。您的任务是评估两个AI模型生成回答的质量。
我们将为您提供用户问题及一对AI生成的回答(回答A和回答B)。
您应当首先仔细阅读用户问题,然后根据以下提供的规则评估回答的质量。
# 用户与AI之间的对话
## 用户问题
<|begin_of_query|>
{question}
<|end_of_query|>
## 回答A
<|begin_of_response_A|>
{prediction}
<|end_of_response_A|>
## 回答B
<|begin_of_response_B|>
{prediction2}
<|end_of_response_B|>
# 评估
## 检查清单
<|begin_of_checklist|>
{checklist}
<|end_of_checklist|>
请参考此检查清单来评估回答的质量,但不要局限于此检查清单。
## 规则
您应当基于用户查询,分析比较上述两种回答。
您应当基于检查清单写下您的分析,然后提供您的评价。
有五个选项供您做出最终评估:["A++", "A+", "A=B", "B+", "B++"],它们对应如下含义:
- `A++`:回答A远胜于回答B。
- `A+`:回答A略优于回答B。
- `A=B`:回答A和回答B质量相同。请谨慎使用此选项。
- `B+`:回答B略优于回答A。
- `B++`:回答B远胜于回答A。
## 输出格式
首先,请输出您对每个模型回答的分析,
然后总结您的评估到三个方面:"A=B的理由"、"A优于B的理由" 和 "B优于A的理由",
最后做出您对最终评估的选择。
请按照以下json格式提供您的评估结果通过填充[]中的占位符:
```
{
    "回答A的分析": "[回答A的分析]",
    "回答B的分析": "[回答B的分析]",
    "A=B的理由": "[A和B回答差不多的理由]",
    "A优于B的理由": "[回答A优于B的理由]",
    "B优于A的理由": "[回答B优于A的理由]",
    "choice": "[A++ or A+ or A=B or B+ or B++]",
}
```
"""
checklist_datasets = []
gpt4 = [
    dict(
        abbr='gpt4o',
    )
]
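
# Build one dataset config per split: responses are generated with a plain
# {question} prompt, then judged pairwise by LMEvaluator using the
# language-matched checklist prompt.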
for lan, data_name_list in subjective_all_sets.items():
    if lan == 'en':
        pair_prompt = pair_prompt_en
    elif lan == 'cn':
        pair_prompt = pair_prompt_cn
    for _name in data_name_list:
        subjective_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='{question}'),
                    ]
                ),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=4096),
        )

        subjective_eval_cfg = dict(
            evaluator=dict(
                type=LMEvaluator,
                prompt_template=dict(
                    type=PromptTemplate,
                    template=dict(
                        round=[
                            dict(role='HUMAN', prompt=pair_prompt),
                        ]
                    ),
                ),
            ),
            pred_role='BOT',
        )

        checklist_datasets.append(
            dict(
                abbr=f'{_name}',
                type=CompassBenchCheklistDataset,
                path=data_path,
                name=_name,
                reader_cfg=subjective_reader_cfg,
                infer_cfg=subjective_infer_cfg,
                eval_cfg=subjective_eval_cfg,
                mode='m2n',
                infer_order='random',
                base_models=gpt4,
            )
        )
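
# Usage sketch (assumption, not part of this file): a top-level OpenCompass
# eval config would typically pull these definitions in via read_base, e.g.:
#
#   from mmengine.config import read_base
#   with read_base():
#       from .datasets.subjective.compassbench.compassbench_checklist import \
#           checklist_datasets
#   datasets = [*checklist_datasets]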