[Update] update docs and add compassarena (#1614)

* fix pip version

* fix pip version

* update docs and add compassarena

* update docs
bittersweet1999 2024-10-17 14:39:06 +08:00 committed by GitHub
parent 4fe251729b
commit f0d436496e
6 changed files with 406 additions and 2 deletions

View File

@@ -0,0 +1,142 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'ref'],
output_column='judge',
)
data_path = 'data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求,在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
并提供你的解释原因。
如果你认为回答1更好,你的输出应形如:
选择:A
原因:blahblah blahblah\n
如果你认为回答2更好,你的输出应形如:
选择:B
原因:blahblah blahblah\n
如果你认为回答1、2打成平手,你的输出应形如:
选择:C
原因:blahblah blahblah\n
"""
knowledge_prompt = """
请根据提供的 评分要求、用户问题、参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答能与参考答案吻合或表明参考答案的意思。
2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
language_prompt = """
请根据提供的 评分要求、用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好。
2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等。
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误。
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
math_prompt = """
请根据提供的 评分要求、用户问题、参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答的答案能和参考答案一致。
2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
reason_prompt = math_prompt
creation_prompt = """
请根据提供的 评分要求、用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 好的回答必须首先符合用户问题里的各种需求,不能跑题。
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答。
3. 好的回答必须具有创造性的词语和表达丰富度。
[用户问题]
{question}
""" + base_prompt
sub_map = {'language': language_prompt, 'knowledge': knowledge_prompt, 'reason_v2': reason_prompt, 'math_v2': math_prompt, 'creationv2_zh': creation_prompt}
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name, _prompt in sub_map.items():
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = _prompt
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
compassarena_datasets.append(
dict(
abbr=f'compassarena_{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))
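For context, a top-level evaluation config would import `compassarena_datasets` and pair it with the models under test and a judge model. The sketch below is illustrative only: it assumes the usual OpenCompass subjective-evaluation entry points (`SubjectiveNaivePartitioner`, `SubjectiveEvalTask`, `LocalRunner`), the relative import path and the placeholder model lists are not part of this PR, and exact keyword arguments may differ between OpenCompass versions.

from mmengine.config import read_base
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

with read_base():
    # Illustrative relative path; adjust to wherever this config file lives.
    from .datasets.subjective.compassarena.compassarena_compare import \
        compassarena_datasets

datasets = [*compassarena_datasets]
models = [...]        # placeholder: configs of the models to be compared
judge_models = [...]  # placeholder: config of the judge model, e.g. a GPT-4-class API model

# Subjective evaluation routes every pair of predictions through the judge model.
eval = dict(
    partitioner=dict(type=SubjectiveNaivePartitioner,
                     models=models,
                     judge_models=judge_models),
    runner=dict(type=LocalRunner,
                max_num_workers=2,
                task=dict(type=SubjectiveEvalTask)),
)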

View File

@@ -127,7 +127,8 @@ Taking Alignbench as an example, `configs/datasets/subjective/alignbench/alignbe
1. First, you need to set `subjective_reader_cfg` to receive the relevant fields returned from the custom Dataset class and specify the output fields when saving files.
2. Then, you need to specify the root path `data_path` of the dataset and the dataset filename `subjective_all_sets`. If there are multiple sub-files, you can add them to this list.
3. Specify `subjective_infer_cfg` and `subjective_eval_cfg` to configure the corresponding inference and evaluation prompts.
4. Finally, specify additional information such as `mode`, `summarizer`, etc., at the appropriate location. Note that for different subjective datasets, the fields that need to be specified may vary. Additionally, the summarizer class for the respective dataset also needs to be implemented to perform data statistics. You can refer to the summarizer implementations of other datasets, located in `opencompass/opencompass/summarizers/subjective`.
4. Specify additional information such as `mode` at the appropriate location. Note that the required fields may vary across subjective datasets.
5. Define the post-processing and score statistics. For example, see the post-processing function `alignbench_postprocess` under `opencompass/opencompass/datasets/subjective/alignbench`; a minimal sketch of such a hook follows this list.
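A minimal sketch of such a post-processing hook, written in the style of `alignbench_postprocess` and the `compassarena_postprocess` added in this PR. The dataset name, the scoring rule, and the `extract_score` parser are invented for illustration, and the import paths assume the package layout shown in this diff:

import re
from collections import defaultdict

from opencompass.registry import DICT_POSTPROCESSORS
from opencompass.datasets.subjective.utils import get_judgeanswer_and_reference


def extract_score(item):
    # Parse the judge model's raw text, e.g. '评分:8' or 'Score: 8', into an int.
    match = re.search(r'(?:评分:|Score: )(\d+)', item['prediction'])
    return int(match.group(1)) if match else None


@DICT_POSTPROCESSORS.register_module('mydataset')  # hypothetical registry key
def mydataset_postprocess(output: dict, output_path: str) -> dict:
    # Pair each extracted judgement with its reference record, then average per capability.
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, extract_score)
    scores = defaultdict(list)
    for score, ref in zip(judged_answers, references):
        scores[ref['capability']].append(score)
    results = {cap: round(sum(v) / len(v), 2) for cap, v in scores.items()}
    results['details'] = output
    return results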
### Step-3: Launch the Evaluation

View File

@@ -128,7 +128,8 @@ judgemodel通常被设置为GPT4等强力模型,可以直接按照config文件
1. 首先需要设置`subjective_reader_cfg`,用以接收从自定义的Dataset类里return回来的相关字段,并指定保存文件时的output字段
2. 然后需要指定数据集的根路径`data_path`以及数据集的文件名`subjective_all_sets`,如果有多个子文件,在这个list里进行添加即可
3. 指定`subjective_infer_cfg`和`subjective_eval_cfg`,配置好相应的推理和评测的prompt
4. 最后在相应的位置指定`mode`、`summarizer`等额外信息,注意,对于不同的主观数据集,所需指定的字段可能不尽相同。此外,相应数据集的summarizer类也需要自己实现以进行数据的统计,可以参考其他数据集的summarizer实现,位于`opencompass/opencompass/summarizers/subjective`
4. 在相应的位置指定`mode`等额外信息,注意,对于不同的主观数据集,所需指定的字段可能不尽相同。
5. 定义后处理与得分统计,例如`opencompass/opencompass/datasets/subjective/alignbench`下的alignbench_postprocess处理函数。
### 第三步 启动评测并输出评测结果

View File

@@ -0,0 +1,142 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'ref'],
output_column='judge',
)
data_path = 'data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求,在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
并提供你的解释原因。
如果你认为回答1更好,你的输出应形如:
选择:A
原因:blahblah blahblah\n
如果你认为回答2更好,你的输出应形如:
选择:B
原因:blahblah blahblah\n
如果你认为回答1、2打成平手,你的输出应形如:
选择:C
原因:blahblah blahblah\n
"""
knowledge_prompt = """
请根据提供的 评分要求、用户问题、参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答能与参考答案吻合或表明参考答案的意思。
2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
language_prompt = """
请根据提供的 评分要求、用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好。
2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等。
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误。
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
math_prompt = """
请根据提供的 评分要求、用户问题、参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答的答案能和参考答案一致。
2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
reason_prompt = math_prompt
creation_prompt = """
请根据提供的 评分要求、用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 好的回答必须首先符合用户问题里的各种需求,不能跑题。
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答。
3. 好的回答必须具有创造性的词语和表达丰富度。
[用户问题]
{question}
""" + base_prompt
sub_map = {'language': language_prompt, 'knowledge': knowledge_prompt, 'reason_v2': reason_prompt, 'math_v2': math_prompt, 'creationv2_zh': creation_prompt}
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name, _prompt in sub_map.items():
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = _prompt
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
compassarena_datasets.append(
dict(
abbr=f'compassarena_{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))

View File

@@ -5,6 +5,7 @@ from .alpacaeval import AlpacaEvalDataset # noqa: F401, F403
from .alpacaeval import alpacaeval_postprocess # noqa: F401, F403
from .arena_hard import ArenaHardDataset # noqa: F401, F403
from .arena_hard import arenahard_postprocess # noqa: F401, F403
from .compass_arena import CompassArenaDataset, compassarena_postprocess
from .compassbench import CompassBenchDataset # noqa: F401, F403
from .compassbench_checklist import \
CompassBenchCheklistDataset # noqa: F401, F403

View File

@@ -0,0 +1,117 @@
# flake8: noqa: E501
import re
from collections import defaultdict
from datasets import Dataset
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from .subjective_cmp import SubjectiveCmpDataset
from .utils import get_judgeanswer_and_reference
@LOAD_DATASET.register_module()
class CompassArenaDataset(SubjectiveCmpDataset):
def load(self, path: str, name: str, *args, **kwargs):
dataset = list(super().load(path, name))
creation_dataset = []
for data in dataset:
if 'reference' in data['others']:
if data['others']['reference'] is not None:
data['ref'] = data['others']['reference']
else:
data['ref'] = '满足用户需求,言之有理即可'
else:
data['ref'] = '满足用户需求,言之有理即可'
creation_dataset.append(data)
dataset = Dataset.from_list(creation_dataset)
return dataset
def check_position_bias(judged_answers, references, banned_choice=['C']):
"""Check position bias for judgellm's judgement.
Args:
judged_answers: The successfully extracted judgement.
references: The references contains original question, which is used to located the same question for different position judgement.
"""
position_bias_flag = 0
position_bias_dict = {}
for judge, ref in zip(judged_answers, references):
question = ref['question']
question_hash = hash(question)
if question_hash not in position_bias_dict:
position_bias_dict[question_hash] = {
'question': question,
'judge': judge
}
else:
first_judge = position_bias_dict[question_hash]['judge']
if judge == first_judge and first_judge not in banned_choice and judge not in banned_choice:
# If the judge picks the same option in both position-swapped runs, count it as position bias.
position_bias_flag += 1
return position_bias_flag
def post_process_compassarena(item):
s = item['prediction']
if result := re.findall('(?:选择:|Choice: )([ABC])', s):
return result[0]
else:
return None
@DICT_POSTPROCESSORS.register_module('compassarena')
def compassarena_postprocess(output: dict,
output_path: str,
summary_type='half_add',
check_pos_bias=True) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
output, output_path, post_process_compassarena)
if check_pos_bias:
bias_num = check_position_bias(judged_answers, references)
else:
bias_num = 0
win_model1 = defaultdict(float)
win_model2 = defaultdict(float)
categories = defaultdict(float)
model1 = references[0]['answer1']
for prediction, reference in zip(judged_answers, references):
categories[reference['capability']] += 1
if prediction == 'A':
if reference['answer1'] == model1:
score_1, score_2 = 1, 0
else:
score_1, score_2 = 0, 1
elif prediction == 'B':
if reference['answer1'] == model1:
score_1, score_2 = 0, 1
else:
score_1, score_2 = 1, 0
elif prediction == 'C':
if summary_type == 'half_add':
score_1, score_2 = 0.5, 0.5
else:
score_1, score_2 = 0, 0
win_model1[reference['capability']] += score_1
win_model2[reference['capability']] += score_2
for capability in categories:
win_model1[
capability] = win_model1[capability] / categories[capability] * 100
win_model1[capability] = round(win_model1[capability], 2)
win_model2[
capability] = win_model2[capability] / categories[capability] * 100
win_model2[capability] = round(win_model2[capability], 2)
win_model1['position_bias'] = bias_num
win_model2['position_bias'] = bias_num
results = win_model2
results['details'] = output
return results
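As a quick illustration of the choice-extraction step, the snippet below applies the same pattern used by `post_process_compassarena` to a few invented judge outputs; judgements from which no choice can be extracted come back as None and are effectively excluded from the win-rate bookkeeping above.

import re

# Same pattern as in post_process_compassarena.
CHOICE_PATTERN = '(?:选择:|Choice: )([ABC])'

samples = [
    '选择:A\n原因:回答1与参考答案一致,且补充准确。',     # -> 'A'
    'Choice: C\nReason: both answers are equally good.',  # -> 'C'
    '两个回答都不错,难以取舍。',                          # -> None (no explicit choice)
]
for text in samples:
    found = re.findall(CHOICE_PATTERN, text)
    print(found[0] if found else None)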