OpenCompass/configs/datasets/subjective/compassbench/compassbench_checklist.py

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassBenchCheklistDataset

# Reader settings shared by every dataset entry built in this file: the
# `question` and `checklist` columns are inputs, `judge` is the output column.
subjective_reader_cfg = {
    'input_columns': ['question', 'checklist'],
    'output_column': 'judge',
}

# Subset names grouped by language key. Every entry follows the pattern
# '<category>/compass_bench_<category>_<language>_val', so the table is
# generated instead of being spelled out entry by entry.
subjective_all_sets = {
    _language: [
        f'{_category}/compass_bench_{_category}_{_language}_val'
        for _category in ('language', 'instruct', 'reasoning', 'coding')
    ]
    for _language in ('en', 'cn')
}

# Relative directory handed to every dataset entry below as its `path`;
# presumably resolved against the working directory at load time -- confirm.
data_path = './data/compassbench_v1_3/data'

# English pairwise judge prompt. It is selected for the 'en' language key in
# the dataset loop below and used as the HUMAN turn of the evaluator's prompt
# template. The text carries four placeholders -- {question}, {prediction},
# {prediction2} and {checklist} -- and asks the judge for a five-way verdict
# (A++, A+, A=B, B+, B++) inside a JSON-like object.
# NOTE(review): the text below is runtime input to the judge model and is
# intentionally left byte-identical. Two things look unintended and should be
# confirmed before changing them, since any edit can shift judge results:
#   * "based on and rules provided below" reads as if a word is missing;
#   * the example object ends with a trailing comma after the "choice" entry,
#     so it is not strict JSON.
# The prompt also mentions a "conversation history" although no history
# placeholder appears in the template.
pair_prompt_en = """# Instruction

You are an expert evaluator. Your task is to evaluate the quality of the \
responses generated by two AI models.
We will provide you with the user query and a pair of AI-generated \
responses (Response A and Response B).
You should first read the user query and the conversation history \
carefully for analyzing the task, and then evaluate the quality of the \
responses based on and rules provided below.

# Conversation between User and AI

## User Query
<|begin_of_query|>

{question}

<|end_of_query|>

## Response A
<|begin_of_response_A|>

{prediction}

<|end_of_response_A|>

## Response B
<|begin_of_response_B|>

{prediction2}

<|end_of_response_B|>

# Evaluation

## Checklist

<|begin_of_checklist|>

{checklist}

<|end_of_checklist|>

Please use this checklist to guide your evaluation, but do not limit your \
assessment to the checklist.

## Rules

You should compare the above two responses based on your analysis of the \
user queries and the conversation history.
You should first write down your analysis and the checklist that you used \
for the evaluation, and then provide your assessment according to the \
checklist.
There are five choices to give your final assessment: ["A++", "A+", \
"A=B", "B+", "B++"], which correspond to the following meanings:

- `A++`: Response A is much better than Response B.
- `A+`: Response A is only slightly better than Response B.
- `A=B`: Response A and B are of the same quality. Please use this \
choice sparingly.
- `B+`: Response B is only slightly better than Response A.
- `B++`: Response B is much better than Response A.

## Output Format
First, please output your analysis for each model response, and \
then summarize your assessment to three aspects: "reason A=B", \
"reason A>B", and "reason B>A", and finally make your choice for \
the final assessment.

Please provide your evaluation results in the following json \
format by filling in the placeholders in []:
```
{
    "analysis of A": "[analysis of Response A]",
    "analysis of B": "[analysis of Response B]",
    "reason of A=B": "[where Response A and B perform equally well]",
    "reason of A>B": "[where Response A is better than Response B]",
    "reason of B>A": "[where Response B is better than Response A]",
    "choice": "[A++ or A+ or A=B or B+ or B++]",
}
```
"""


# Chinese pairwise judge prompt. It is selected for the 'cn' language key in
# the dataset loop below and used as the HUMAN turn of the evaluator's prompt
# template. It carries the same four placeholders as the English prompt --
# {question}, {prediction}, {prediction2} and {checklist} -- and asks for the
# same five-way verdict (A++, A+, A=B, B+, B++). The analysis keys of the
# example object are in Chinese, while the "choice" key stays in English.
# NOTE(review): the text below is runtime input to the judge model and is
# intentionally left byte-identical. The example object ends with a trailing
# comma after the "choice" entry, so it is not strict JSON -- confirm whether
# that is intended before changing it.
pair_prompt_cn = """# 指令

您是一位专业评估专家。您的任务是评估两个AI模型生成回答的质量。
我们将为您提供用户问题及一对AI生成的回答（回答A和回答B）。
您应当首先仔细阅读用户问题，然后根据以下提供的规则评估回答的质量。

# 用户与AI之间的对话

## 用户问题
<|begin_of_query|>

{question}

<|end_of_query|>

## 回答A
<|begin_of_response_A|>

{prediction}

<|end_of_response_A|>

## 回答B
<|begin_of_response_B|>

{prediction2}

<|end_of_response_B|>

# 评估

## 检查清单

<|begin_of_checklist|>

{checklist}

<|end_of_checklist|>

请参考此检查清单来评估回答的质量，但不要局限于此检查清单。

## 规则

您应当基于用户查询，分析比较上述两种回答。
您应当基于检查清单写下您的分析，然后提供您的评价。
有五个选项供您做出最终评估：["A++", "A+", "A=B", "B+", "B++"]，它们对应如下含义：

- `A++`：回答A远胜于回答B。
- `A+`：回答A略优于回答B。
- `A=B`：回答A和回答B质量相同。请谨慎使用此选项。
- `B+`：回答B略优于回答A。
- `B++`：回答B远胜于回答A。

## 输出格式
首先，请输出您对每个模型回答的分析，
然后总结您的评估到三个方面："A=B的理由"，"A优于B的理由"，和 "B优于A的理由"，
最后做出您对最终评估的选择。

请按照以下json格式提供您的评估结果，通过填充[]中的占位符：
```
{
    "回答A的分析": "[回答A的分析]",
    "回答B的分析": "[回答B的分析]",
    "A=B的理由": "[A和B回答差不多的理由]",
    "A优于B的理由": "[回答A优于B的理由]",
    "B优于A的理由": "[回答B优于A的理由]",
    "choice": "[A++ or A+ or A=B or B+ or B++]",
}
```
"""

# Collects one dataset config per (language, subset) pair listed in
# `subjective_all_sets`.
checklist_datasets = []

# Model list passed to every dataset entry as `base_models`; presumably the
# reference side of each pairwise comparison -- confirm against the evaluator.
gpt4 = [
    dict(
        abbr='gpt4o',
    )
]

for lan, data_name_list in subjective_all_sets.items():
    # Pick the judge prompt that matches the language of the subsets.
    if lan == 'en':
        pair_prompt = pair_prompt_en
    elif lan == 'cn':
        pair_prompt = pair_prompt_cn
    else:
        # Without this branch a new language key would silently reuse the
        # prompt left over from the previous iteration (or hit a NameError if
        # it came first), producing judgements in the wrong language.
        raise ValueError(
            f'No pairwise judge prompt is defined for language {lan!r}; '
            "expected one of ['en', 'cn']."
        )

    for _name in data_name_list:
        # Inference stage: the model under test answers the bare question.
        # Built per entry so that no two dataset configs share this dict.
        subjective_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='{question}'),
                    ]
                ),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=4096),
        )

        # Evaluation stage: the judge receives the language-specific
        # pairwise prompt as a single HUMAN turn.
        subjective_eval_cfg = dict(
            evaluator=dict(
                type=LMEvaluator,
                prompt_template=dict(
                    type=PromptTemplate,
                    template=dict(
                        round=[
                            dict(role='HUMAN', prompt=pair_prompt),
                        ]
                    ),
                ),
            ),
            pred_role='BOT',
        )

        checklist_datasets.append(
            dict(
                # The subset name is already a string, so it is used directly
                # as the abbreviation.
                abbr=_name,
                type=CompassBenchCheklistDataset,
                path=data_path,
                name=_name,
                reader_cfg=subjective_reader_cfg,
                infer_cfg=subjective_infer_cfg,
                eval_cfg=subjective_eval_cfg,
                mode='m2n',
                infer_order='random',
                base_models=gpt4,
            )
        )
-												[Feature] support compassbench Checklist evaluation (#1339)

* fix pip version

* fix pip version

* support checklist eval

* init

* add lan

* fix typo
											
										
										
											2024-07-19 16:40:44 +08:00
+								from opencompass.openicl.icl_prompt_template import PromptTemplate
 								from opencompass.openicl.icl_retriever import ZeroRetriever
 								from opencompass.openicl.icl_inferencer import GenInferencer
 								from opencompass.openicl.icl_evaluator import LMEvaluator
 								from opencompass.datasets import CompassBenchCheklistDataset
 								subjective_reader_cfg = dict(
-												[Feature] CompassBench v1_3 subjective evaluation (#1341)

* stash files

* compassbench subjective evaluation added

* evaluation update

* remove unneeded content

* fix lint

* update docs

* Update lint

* Update

---------

Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn>
											
										
										
											2024-07-19 23:12:23 +08:00
+								    input_columns=['question', 'checklist'],
-												[Feature] support compassbench Checklist evaluation (#1339)

* fix pip version

* fix pip version

* support checklist eval

* init

* add lan

* fix typo
											
										
										
											2024-07-19 16:40:44 +08:00
+								    output_column='judge',
-												[Feature] CompassBench v1_3 subjective evaluation (#1341)

* stash files

* compassbench subjective evaluation added

* evaluation update

* remove unneeded content

* fix lint

* update docs

* Update lint

* Update

---------

Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn>
											
										
										
											2024-07-19 23:12:23 +08:00
+								)
 								subjective_all_sets = {
 								    'en': [
 								        'language/compass_bench_language_en_val',
 								        'instruct/compass_bench_instruct_en_val',
 								        'reasoning/compass_bench_reasoning_en_val',
 								        'coding/compass_bench_coding_en_val',
 								    ],
 								    'cn': [
 								        'language/compass_bench_language_cn_val',
 								        'instruct/compass_bench_instruct_cn_val',
 								        'reasoning/compass_bench_reasoning_cn_val',
 								        'coding/compass_bench_coding_cn_val',
 								    ],
 								}
-												[Feature] support compassbench Checklist evaluation (#1339)

* fix pip version

* fix pip version

* support checklist eval

* init

* add lan

* fix typo
											
										
										
											2024-07-19 16:40:44 +08:00
-												[Feature] CompassBench v1_3 subjective evaluation (#1341)

* stash files

* compassbench subjective evaluation added

* evaluation update

* remove unneeded content

* fix lint

* update docs

* Update lint

* Update

---------

Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn>
											
										
										
											2024-07-19 23:12:23 +08:00
+								data_path = './data/compassbench_v1_3/data'
-												[Feature] support compassbench Checklist evaluation (#1339)

* fix pip version

* fix pip version

* support checklist eval

* init

* add lan

* fix typo
											
										
										
											2024-07-19 16:40:44 +08:00
 								pair_prompt_en = """# Instruction
 								You are an expert evaluator. Your task is to evaluate the quality of the \
 								responses generated by two AI models.
 								We will provide you with the user query and a pair of AI-generated \
 								responses (Response A and Response B).
 								You should first read the user query and the conversation history \
 								carefully for analyzing the task, and then evaluate the quality of the \
 								responses based on and rules provided below.
 								# Conversation between User and AI
 								## User Query
 								<|begin_of_query|>
 								{question}
 								<|end_of_query|>
 								## Response A
 								<|begin_of_response_A|>
 								{prediction}
 								<|end_of_response_A|>
 								## Response B
 								<|begin_of_response_B|>
 								{prediction2}
 								<|end_of_response_B|>
 								# Evaluation
 								## Checklist
 								<|begin_of_checklist|>
 								{checklist}
 								<|end_of_checklist|>
 								Please use this checklist to guide your evaluation, but do not limit your \
 								assessment to the checklist.
 								## Rules
 								You should compare the above two responses based on your analysis of the \
 								user queries and the conversation history.
 								You should first write down your analysis and the checklist that you used \
 								for the evaluation, and then provide your assessment according to the \
 								checklist.
 								There are five choices to give your final assessment: ["A++", "A+", \
 								"A=B", "B+", "B++"], which correspond to the following meanings:
 								- `A++`: Response A is much better than Response B.
 								- `A+`: Response A is only slightly better than Response B.
 								- `A=B`: Response A and B are of the same quality. Please use this \
 								choice sparingly.
 								- `B+`: Response B is only slightly better than Response A.
 								- `B++`: Response B is much better than Response A.
 								## Output Format
 								First, please output your analysis for each model response, and \
 								then summarize your assessment to three aspects: "reason A=B", \
 								"reason A>B", and "reason B>A", and finally make your choice for \
 								the final assessment.
 								Please provide your evaluation results in the following json \
 								format by filling in the placeholders in []:
 								```
 								{
 								    "analysis of A": "[analysis of Response A]",
 								    "analysis of B": "[analysis of Response B]",
 								    "reason of A=B": "[where Response A and B perform equally well]",
 								    "reason of A>B": "[where Response A is better than Response B]",
 								    "reason of B>A": "[where Response B is better than Response A]",
 								    "choice": "[A++ or A+ or A=B or B+ or B++]",
 								}
 								```
 								"""
 								pair_prompt_cn = """# 指令
 								您是一位专业评估专家。您的任务是评估两个AI模型生成回答的质量。
 								我们将为您提供用户问题及一对AI生成的回答（回答A和回答B）。
 								您应当首先仔细阅读用户问题，然后根据以下提供的规则评估回答的质量。
 								# 用户与AI之间的对话
 								## 用户问题
 								<|begin_of_query|>
 								{question}
 								<|end_of_query|>
 								## 回答A
 								<|begin_of_response_A|>
 								{prediction}
 								<|end_of_response_A|>
 								## 回答B
 								<|begin_of_response_B|>
 								{prediction2}
 								<|end_of_response_B|>
 								# 评估
 								## 检查清单
 								<|begin_of_checklist|>
 								{checklist}
 								<|end_of_checklist|>
 								请参考此检查清单来评估回答的质量，但不要局限于此检查清单。
 								## 规则
 								您应当基于用户查询，分析比较上述两种回答。
 								您应当基于检查清单写下您的分析，然后提供您的评价。
 								有五个选项供您做出最终评估：["A++", "A+", "A=B", "B+", "B++"]，它们对应如下含义：
 								- `A++`：回答A远胜于回答B。
 								- `A+`：回答A略优于回答B。
 								- `A=B`：回答A和回答B质量相同。请谨慎使用此选项。
 								- `B+`：回答B略优于回答A。
 								- `B++`：回答B远胜于回答A。
 								## 输出格式
 								首先，请输出您对每个模型回答的分析，
 								然后总结您的评估到三个方面："A=B的理由"，"A优于B的理由"，和 "B优于A的理由"，
 								最后做出您对最终评估的选择。
 								请按照以下json格式提供您的评估结果，通过填充[]中的占位符：
 								```
 								{
 								    "回答A的分析": "[回答A的分析]",
 								    "回答B的分析": "[回答B的分析]",
 								    "A=B的理由": "[A和B回答差不多的理由]",
 								    "A优于B的理由": "[回答A优于B的理由]",
 								    "B优于A的理由": "[回答B优于A的理由]",
 								    "choice": "[A++ or A+ or A=B or B+ or B++]",
 								}
 								```
 								"""
 								checklist_datasets = []
-												[Feature] CompassBench v1_3 subjective evaluation (#1341)

* stash files

* compassbench subjective evaluation added

* evaluation update

* remove unneeded content

* fix lint

* update docs

* Update lint

* Update

---------

Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn>
											
										
										
											2024-07-19 23:12:23 +08:00
+								gpt4 = [
 								    dict(
 								        abbr='gpt4o',
 								    )
 								]
-												[Feature] support compassbench Checklist evaluation (#1339)

* fix pip version

* fix pip version

* support checklist eval

* init

* add lan

* fix typo
											
										
										
											2024-07-19 16:40:44 +08:00
+								for lan, data_name_list in subjective_all_sets.items():
 								    if lan == 'en':
 								        pair_prompt = pair_prompt_en
 								    elif lan == 'cn':
 								        pair_prompt = pair_prompt_cn
 								    for _name in data_name_list:
 								        subjective_infer_cfg = dict(
-												[Feature] CompassBench v1_3 subjective evaluation (#1341)

* stash files

* compassbench subjective evaluation added

* evaluation update

* remove unneeded content

* fix lint

* update docs

* Update lint

* Update

---------

Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn>
											
										
										
											2024-07-19 23:12:23 +08:00
+								            prompt_template=dict(
 								                type=PromptTemplate,
 								                template=dict(
 								                    round=[
 								                        dict(role='HUMAN', prompt='{question}'),
 								                    ]
-												[Feature] support compassbench Checklist evaluation (#1339)

* fix pip version

* fix pip version

* support checklist eval

* init

* add lan

* fix typo
											
										
										
											2024-07-19 16:40:44 +08:00
+								                ),
-												[Feature] CompassBench v1_3 subjective evaluation (#1341)

* stash files

* compassbench subjective evaluation added

* evaluation update

* remove unneeded content

* fix lint

* update docs

* Update lint

* Update

---------

Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn>
											
										
										
											2024-07-19 23:12:23 +08:00
+								            ),
 								            retriever=dict(type=ZeroRetriever),
 								            inferencer=dict(type=GenInferencer, max_out_len=4096),
 								        )
-												[Feature] support compassbench Checklist evaluation (#1339)

* fix pip version

* fix pip version

* support checklist eval

* init

* add lan

* fix typo
											
										
										
											2024-07-19 16:40:44 +08:00
 								        subjective_eval_cfg = dict(
 								            evaluator=dict(
 								                type=LMEvaluator,
 								                prompt_template=dict(
 								                    type=PromptTemplate,
 								                    template=dict(
-												[Feature] CompassBench v1_3 subjective evaluation (#1341)

* stash files

* compassbench subjective evaluation added

* evaluation update

* remove unneeded content

* fix lint

* update docs

* Update lint

* Update

---------

Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn>
											
										
										
											2024-07-19 23:12:23 +08:00
+								                        round=[
 								                            dict(role='HUMAN', prompt=pair_prompt),
 								                        ]
-												[Feature] support compassbench Checklist evaluation (#1339)

* fix pip version

* fix pip version

* support checklist eval

* init

* add lan

* fix typo
											
										
										
											2024-07-19 16:40:44 +08:00
+								                    ),
 								                ),
 								            ),
 								            pred_role='BOT',
 								        )
 								        checklist_datasets.append(
 								            dict(
 								                abbr=f'{_name}',
 								                type=CompassBenchCheklistDataset,
-												[Feature] CompassBench v1_3 subjective evaluation (#1341)

* stash files

* compassbench subjective evaluation added

* evaluation update

* remove unneeded content

* fix lint

* update docs

* Update lint

* Update

---------

Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn>
											
										
										
											2024-07-19 23:12:23 +08:00
+								                path=data_path,
-												[Feature] support compassbench Checklist evaluation (#1339)

* fix pip version

* fix pip version

* support checklist eval

* init

* add lan

* fix typo
											
										
										
											2024-07-19 16:40:44 +08:00
+								                name=_name,
 								                reader_cfg=subjective_reader_cfg,
 								                infer_cfg=subjective_infer_cfg,
 								                eval_cfg=subjective_eval_cfg,
 								                mode='m2n',
 								                infer_order='random',
 								                base_models=gpt4,
-												[Feature] CompassBench v1_3 subjective evaluation (#1341)

* stash files

* compassbench subjective evaluation added

* evaluation update

* remove unneeded content

* fix lint

* update docs

* Update lint

* Update

---------

Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn>
											
										
										
											2024-07-19 23:12:23 +08:00
+								            )
 								        )