Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Update] update docs and add compassarena (#1614)
* fix pip version
* fix pip version
* update docs and add compassarena
* update docs
This commit is contained in:
parent 4fe251729b
commit f0d436496e
configs/datasets/subjective/compassarena/compassarena_compare.py (new file, 142 lines)
@@ -0,0 +1,142 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.summarizers import CompassArenaSummarizer

subjective_reader_cfg = dict(
    input_columns=['question', 'ref'],
    output_column='judge',
)

data_path = 'data/subjective/compass_arena'

compassarena_datasets = []

base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求,在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
并提供你的解释原因。
如果你认为回答1更好,你的输出应形如:
选择:A
原因:blahblah blahblah\n
如果你认为回答2更好,你的输出应形如:
选择:B
原因:blahblah blahblah\n
如果你认为回答1、2打成平手,你的输出应形如:
选择:C
原因:blahblah blahblah\n
"""

knowledge_prompt = """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答能与参考答案吻合或表明参考答案的意思。
2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt


language_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好。
2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误。
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt


math_prompt = """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答的答案能和参考答案一致。
2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt

reason_prompt = math_prompt

creation_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt

sub_map = {'language': language_prompt, 'knowledge': knowledge_prompt, 'reason_v2': reason_prompt, 'math_v2': math_prompt, 'creationv2_zh': creation_prompt}

gpt4 = [dict(
    abbr='gpt4-turbo',
)]

for _name, _prompt in sub_map.items():
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{question}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt=_prompt
                    ),
                ]),
            ),
            dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
        ),
        pred_role='BOT',
    )

    compassarena_datasets.append(
        dict(
            abbr=f'compassarena_{_name}',
            type=CompassArenaDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='m2n',
            infer_order='double',
            base_models=gpt4,
            # summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
            given_pred=[{'abbr': 'gpt4-turbo', 'path': './data/subjective/compass_arena/gpt4-turbo'}]
        ))
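For orientation (not part of this commit): in a typical OpenCompass setup, a top-level config pulls `compassarena_datasets` in through `read_base` and pairs it with candidate models and a judge model. The following is a minimal, assumed sketch; the model entries are placeholders, and the summarizer line simply mirrors the commented-out `CompassArenaSummarizer` usage above.

# Hypothetical top-level eval config (illustrative sketch, not from this commit).
from mmengine.config import read_base

from opencompass.summarizers import CompassArenaSummarizer

with read_base():
    # Reuse the dataset list defined in the new config file above.
    from .datasets.subjective.compassarena.compassarena_compare import \
        compassarena_datasets

datasets = [*compassarena_datasets]

models = [...]        # candidate model configs to be compared (placeholder)
judge_models = [...]  # judge model config, e.g. a GPT-4 API model (placeholder)

# Optional aggregation step, mirroring the commented-out summarizer in the dataset config.
summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add')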
@@ -127,7 +127,8 @@ Taking Alignbench as an example, `configs/datasets/subjective/alignbench/alignbe
 1. First, you need to set `subjective_reader_cfg` to receive the relevant fields returned from the custom Dataset class and specify the output fields when saving files.
 2. Then, you need to specify the root path `data_path` of the dataset and the dataset filename `subjective_all_sets`. If there are multiple sub-files, you can add them to this list.
 3. Specify `subjective_infer_cfg` and `subjective_eval_cfg` to configure the corresponding inference and evaluation prompts.
-4. Finally, specify additional information such as `mode`, `summarizer`, etc., at the appropriate location. Note that for different subjective datasets, the fields that need to be specified may vary. Additionally, the summarizer class for the respective dataset also needs to be implemented to perform data statistics. You can refer to the summarizer implementations of other datasets, located in `opencompass/opencompass/summarizers/subjective`.
+4. Specify additional information such as `mode` at the corresponding location. Note that the fields required for different subjective datasets may vary.
+5. Define post-processing and score statistics, for example the post-processing function `alignbench_postprocess` located under `opencompass/opencompass/datasets/subjective/alignbench`.

 ### Step-3: Launch the Evaluation

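To make step 5 above concrete, here is a minimal, illustrative skeleton (the registry name and function name are hypothetical) of how such a post-processing function is registered and shaped; the real CompassArena implementation added by this commit appears further down in `opencompass/datasets/subjective/compass_arena.py`.

# Illustrative skeleton of a dict post-processor (hypothetical name, simplified logic).
from opencompass.registry import DICT_POSTPROCESSORS


@DICT_POSTPROCESSORS.register_module('my_subjective_bench')
def my_subjective_postprocess(output: dict, output_path: str) -> dict:
    # 1. Parse each raw judge response in `output` into a structured verdict.
    # 2. Aggregate the verdicts into per-category scores.
    # 3. Return the score dict; it becomes the reported result for the dataset.
    scores = {'overall': 0.0}
    # ... extract verdicts from `output` and fill `scores` here ...
    return scores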
@@ -128,7 +128,8 @@ The judgemodel is usually set to a powerful model such as GPT4, and you can directly follow the config file
 1. First, set `subjective_reader_cfg` to receive the relevant fields returned from the custom Dataset class and to specify the output fields used when saving files.
 2. Then specify the dataset root path `data_path` and the dataset file name `subjective_all_sets`; if there are multiple sub-files, simply add them to this list.
 3. Specify `subjective_infer_cfg` and `subjective_eval_cfg`, configuring the corresponding inference and evaluation prompts.
-4. Finally, specify extra information such as `mode` and `summarizer` in the corresponding places. Note that the required fields may differ across subjective datasets. In addition, the summarizer class for the dataset also needs to be implemented to aggregate the results; the summarizer implementations of other datasets, located in `opencompass/opencompass/summarizers/subjective`, can be used as references.
+4. Specify extra information such as `mode` in the corresponding places. Note that the required fields may differ across subjective datasets.
+5. Define post-processing and score statistics, e.g. the `alignbench_postprocess` function under `opencompass/opencompass/datasets/subjective/alignbench`.

 ### Step 3: Launch the evaluation and output the results

@@ -5,6 +5,7 @@ from .alpacaeval import AlpacaEvalDataset # noqa: F401, F403
 from .alpacaeval import alpacaeval_postprocess # noqa: F401, F403
 from .arena_hard import ArenaHardDataset # noqa: F401, F403
 from .arena_hard import arenahard_postprocess # noqa: F401, F403
+from .compass_arena import CompassArenaDataset, compassarena_postprocess
 from .compassbench import CompassBenchDataset # noqa: F401, F403
 from .compassbench_checklist import \
     CompassBenchCheklistDataset # noqa: F401, F403
opencompass/datasets/subjective/compass_arena.py (new file, 117 lines)
@@ -0,0 +1,117 @@
# flake8: noqa: E501
import re
from collections import defaultdict

from datasets import Dataset

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET

from .subjective_cmp import SubjectiveCmpDataset
from .utils import get_judgeanswer_and_reference


@LOAD_DATASET.register_module()
class CompassArenaDataset(SubjectiveCmpDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        dataset = list(super().load(path, name))
        creation_dataset = []
        for data in dataset:
            if 'reference' in data['others']:
                if data['others']['reference'] is not None:
                    data['ref'] = data['others']['reference']
                else:
                    data['ref'] = '满足用户需求,言之有理即可'
            else:
                data['ref'] = '满足用户需求,言之有理即可'
            creation_dataset.append(data)
        dataset = Dataset.from_list(creation_dataset)
        return dataset


def check_position_bias(judged_answers, references, banned_choice=['C']):
    """Check position bias for the judge LLM's judgement.

    Args:
        judged_answers: The successfully extracted judgements.
        references: The references contain the original question, which is used to locate the same question across the two judging positions.
    """
    position_bias_flag = 0
    position_bias_dict = {}
    for judge, ref in zip(judged_answers, references):
        question = ref['question']
        question_hash = hash(question)
        if question_hash not in position_bias_dict:
            position_bias_dict[question_hash] = {
                'question': question,
                'judge': judge
            }
        else:
            first_judge = position_bias_dict[question_hash]['judge']
            if judge == first_judge and first_judge not in banned_choice and judge not in banned_choice:
                # If the second choice is the same as the first choice, there is position bias.
                position_bias_flag += 1
    return position_bias_flag


def post_process_compassarena(item):
    s = item['prediction']
    if result := re.findall('(?:选择:|Choice: )([ABC])', s):
        return result[0]
    else:
        return None


@DICT_POSTPROCESSORS.register_module('compassarena')
def compassarena_postprocess(output: dict,
                             output_path: str,
                             summary_type='half_add',
                             check_pos_bias=True) -> dict:
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_compassarena)

    if check_pos_bias:
        bias_num = check_position_bias(judged_answers, references)
    else:
        bias_num = 0

    win_model1 = defaultdict(float)
    win_model2 = defaultdict(float)
    categories = defaultdict(float)
    model1 = references[0]['answer1']

    for prediction, reference in zip(judged_answers, references):
        categories[reference['capability']] += 1

        if prediction == 'A':
            if reference['answer1'] == model1:
                score_1, score_2 = 1, 0
            else:
                score_1, score_2 = 0, 1
        elif prediction == 'B':
            if reference['answer1'] == model1:
                score_1, score_2 = 0, 1
            else:
                score_1, score_2 = 1, 0
        elif prediction == 'C':
            if summary_type == 'half_add':
                score_1, score_2 = 0.5, 0.5
            else:
                score_1, score_2 = 0, 0

        win_model1[reference['capability']] += score_1
        win_model2[reference['capability']] += score_2
    for capability in categories:
        win_model1[
            capability] = win_model1[capability] / categories[capability] * 100
        win_model1[capability] = round(win_model1[capability], 2)
        win_model2[
            capability] = win_model2[capability] / categories[capability] * 100
        win_model2[capability] = round(win_model2[capability], 2)

    win_model1['position_bias'] = bias_num
    win_model2['position_bias'] = bias_num

    results = win_model2
    results['details'] = output
    return results
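To make the 'half_add' scoring rule concrete, the toy snippet below reproduces the per-verdict arithmetic from `compassarena_postprocess` on hand-made judgements. It is a pure illustration, not part of the commit: the real function also un-swaps the answer order via `reference['answer1']` (because `infer_order='double'` judges each question twice with the answers swapped), while this toy version assumes 回答1 always corresponds to model 1.

# Toy illustration of the half_add scoring: 'A'/'B' award 1 point to the winning
# side, and a 'C' (tie) awards 0.5 to each side when summary_type='half_add'.
from collections import defaultdict

toy_judgements = ['A', 'C', 'B', 'C']                     # verdicts extracted from the judge model
toy_capabilities = ['math', 'math', 'knowledge', 'knowledge']

win_model1, win_model2, counts = defaultdict(float), defaultdict(float), defaultdict(float)
for verdict, cap in zip(toy_judgements, toy_capabilities):
    counts[cap] += 1
    if verdict == 'A':
        s1, s2 = 1, 0
    elif verdict == 'B':
        s1, s2 = 0, 1
    else:  # 'C' tie under half_add
        s1, s2 = 0.5, 0.5
    win_model1[cap] += s1
    win_model2[cap] += s2

for cap in counts:  # convert to percentages, as the real postprocessor does
    print(cap,
          round(win_model1[cap] / counts[cap] * 100, 2),
          round(win_model2[cap] / counts[cap] * 100, 2))
# math 75.0 25.0
# knowledge 25.0 75.0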