[Feature] Add configs for creationbench (#791)
* add creationv2_zh
* add eng config for creationbench
Commit 83d6c48378 (parent d0dc3534e5)
@@ -0,0 +1,60 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CreationBenchDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'score_with_ref_prefix', 'score_with_ref_suffix'],
    output_column='judge',
    )

subjective_all_sets = [
    "creationv2_zh",
]
data_path = "data/subjective/"

subjective_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt="{question}"
                    ),
                ]),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
        )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt = "{score_with_ref_prefix}{prediction}{score_with_ref_suffix}"
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=CreationBenchDataset,
            multi_dimension=True,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
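In the evaluator template above, {prediction} is the evaluated model's own answer, while {score_with_ref_prefix} and {score_with_ref_suffix} are reader columns produced by CreationBenchDataset. A minimal sketch of how the final judge prompt gets assembled (the sample values below are invented for illustration, not taken from the dataset):

# Hypothetical illustration of the judge-prompt assembly performed by the
# LMEvaluator template "{score_with_ref_prefix}{prediction}{score_with_ref_suffix}".
sample = {
    'score_with_ref_prefix': "...scoring rules...\n[Model's response start]\n",  # built by the dataset loader
    'score_with_ref_suffix': "\n[Model's response end]\n",
}
prediction = "Once upon a time ..."  # the evaluated model's generation

judge_prompt = "{score_with_ref_prefix}{prediction}{score_with_ref_suffix}".format(
    prediction=prediction, **sample)
print(judge_prompt)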
@@ -14,7 +14,8 @@ with read_base():
 
 datasets = [*subjective_datasets]
 
-from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
+from opencompass.models.openai_api import OpenAIAllesAPIN
 from opencompass.partitioners import NaivePartitioner
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
 from opencompass.runners import LocalRunner
@@ -59,7 +60,8 @@ judge_model = dict(
     query_per_second=16,
     max_out_len=2048,
     max_seq_len=2048,
-    batch_size=8
+    batch_size=8,
+    temperature = 0
 )
 
 ## ------------- Evaluation Configuration
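The added temperature = 0 keeps the GPT-4 judge's scoring as deterministic as the API allows, so repeated evaluation runs grade the same predictions consistently. As a hedged aside (an illustrative pattern, not part of this commit), the placeholder key='xxxx' can be avoided by reading the key from the environment, consistent with the config comment below that the key may also come from $OPENAI_API_KEY:

# Illustrative only: supply the judge API key from the environment instead of
# committing a placeholder literal.
import os

judge_api_key = os.environ.get('OPENAI_API_KEY', 'xxxx')  # 'xxxx' mirrors the placeholder used in the config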
configs/eval_subjective_creationbench.py (new file, 87 lines)
@@ -0,0 +1,87 @@
from os import getenv as gv

from mmengine.config import read_base
with read_base():
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b
    from .models.judge_llm.auto_j.hf_autoj_eng_13b import models as hf_autoj
    from .models.judge_llm.judgelm.hf_judgelm_33b_v1 import models as hf_judgelm
    from .models.judge_llm.pandalm.hf_pandalm_7b_v1 import models as hf_pandalm
    from .datasets.subjective_creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets

datasets = [*subjective_datasets]

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CreationBenchSummarizer


# ------------- Inference Stage ----------------------------------------

models = [*hf_baichuan2_7b]  # , *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)


# ------------- Evaluation Stage ----------------------------------------


## ------------- JudgeLLM Configuration
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

judge_model = dict(
    abbr='GPT4-Turbo',
    type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    url='xxxx',
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,
    max_seq_len=2048,
    batch_size=8,
    temperature = 0
)

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='singlescore',
        models = [*hf_baichuan2_7b]
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=judge_model
        )),
)

summarizer = dict(
    type=CreationBenchSummarizer, judge_type = 'general'
)

work_dir = 'outputs/creationbench/'
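The inference stage above assumes a Slurm cluster (SlurmSequentialRunner with an 'llmeval' partition). A hedged sketch of running the same inference locally instead, reusing classes already imported in this config (the worker count is an arbitrary example, not from the PR):

# Sketch only: local alternative to the Slurm-based inference runner above.
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=8,  # arbitrary example value
        task=dict(type=OpenICLInferTask)),
)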
@@ -15,7 +15,9 @@ You are an assistant skilled at evaluating the quality of creative text.
 Please evaluate the quality of an AI model's response to a creative question in the capacity of an impartial judge. You'll need to assess the response on the following dimensions: Creativity, Richness, User Demand Fulfillment, and Logical Coherence. We will provide you with a creative question and the AI model's response for evaluation. As you begin your assessment, follow this process:
 1. Evaluate the AI model's answers on different dimensions, pointing out its strengths or weaknesses in each dimension and assigning a score of 1 to 10 for each.
 2. Finally, based on the assessments across dimensions, provide an overall score of 1 to 10 for the AI model's response.
-3. Your scoring should be as stringent as possible and follow the scoring rules below: Generally, the higher the quality of the model's response, the higher the score.
+3. Your scoring should be as stringent as possible and follow the scoring rules below:
+
+Generally, the higher the quality of the model's response, the higher the score.
 
 Creativity Scoring Guidelines:
 When the model's response fails to provide any innovative or unique content, the creativity score must be between 1 and 2;
@@ -103,6 +105,160 @@ chn_base_prefix = """
 {'维度一': 打分, '维度二': 打分, ..., '综合得分': 打分},例如:{'创造性': 9, '丰富度': 6, ..., '综合得分': 7}。\n
 """
+
+chn_base_prefix_score_with_ref = """
+你是一个擅长评价创作类文本质量的助手。
+请你以公正的评判者的身份,评估一个AI模型对创作类问题的回答的质量。你需要从下面的几个维度对回答进行评估:创造性,丰富度,满足用户需求,逻辑连贯性
+我们会给您提供一个创作类问题,和需要你评估的AI模型的回答以及一个参考答案。当你开始你的评估时,你需要遵守以下的流程:
+1. 从不同维度对AI模型的答案进行评价,给每一个维度一个1~10的分数。
+2. 最后,综合每个维度的评估,对AI模型的回答给出一个1~10的综合得分。
+3. 你的打分需要尽可能严格,并且要遵守下面的评分规则:
+
+总的来说,模型回答的质量越高,且严格符合用户的需求,则分数越高,不符合用户需求的回答分数越低。
+评分规则:
+创造性:
+- 未能够提供任何创新性或独特性内容时,得1-2分;
+- 提供部分原创性的创作内容,但创作质量较低时,得3-4分;
+- 基本均为创造性内容,但无太多新意,质量中等,得5-6分;
+- 具有新意,且内容质量较高时,得7-8分;
+- 非常新颖且质量极高时,相比参考答案明显创造性质量更高,得9-10分。
+丰富度:
+- 缺乏深度和广度,信息量非常有限,得1-2分;
+- 缺乏必要的深度,解释和实例较少,不够相关或不够详细,信息展现出较低的多样性时,得3-4分;
+- 但深度和广度有限,信息多样性一般,仅能从中获得基本所需的信息时,得5-6分;
+- 提供一定的深度,可从中获得必要及额外的有用信息时,得7-8分;
+- 提供了额外的深度和广度,包含多个相关的详细解释和实例,多样性很高,相比参考答案明显丰富度更高,得9-10分。
+满足用户需求:
+- 与需求完全不相关,特别是文体、主题完全不符,字数要求相差很大时,得1-2分;
+- 对需求的理解有限,只能在很小程度上提供相关信息,不太能够帮助用户解决问题,文体、主题、字数与题目要求相差较大时,得3-4分;
+- 部分理解需求,提供部分相关的回应,文体、主题基本符合需求,字数与要求相差不大时,得5-6分;
+- 较好地理解需求,提供较为相关的回应,文体、主题、字数符合问题要求时,得7-8分;
+- 精准地理解所有需求,提供高度相关和个性化的回应,文体、主题、字数完全符合需求时,相比参考答案更能满足用户需求,得9-10分。
+逻辑连贯性:
+- 完全不连贯,没有任何逻辑性,与问题或已知信息完全不匹配时,得1-2分;
+- 一定程度上逻辑连贯,但仍有不少逻辑错误或不一致之处时,得3-4分;
+- 大多数情况下逻辑连贯,错误较少,但在复杂情况下可能无法保持完全的连贯性时,得5-6分;
+- 逻辑出色,很好地处理复杂逻辑,错误非常少见,得7-8分;
+- 逻辑完美,无论问题多么复杂都有无懈可击的逻辑能力,相比参考答案明显逻辑连贯性更高,得9-10分。
+综合得分:
+- 存在与问题不相关,或事实错误,或生成有害内容时,得1-2分;
+- 没有严重错误,基本无害,但质量较低,没有满足要求,得3-4分;
+- 基本满足要求,但是在上述部分维度上表现较差,质量中等,得5-6分;
+- 在所有维度上表现良好,得7到8分;
+- 充分地解决了用户问题和所有需求,综合对比显著超过参考答案的情况下,得9-10分。
+
+请记住,你必须在你打分前进行评价和解释。在你对每个维度的解释之后,需要加上对该维度的打分。之后,在你回答的末尾,按照以下字典格式(包括括号)返回你所有的打分结果,并确保你的打分结果是整数:
+{'维度一': 打分, '维度二': 打分, ..., '综合得分': 打分},例如:{'创造性': 9, '丰富度': 6, ..., '综合得分': 7}。\n
+"""
+
+eng_base_prefix_score_with_ref = """
+You are an assistant skilled at evaluating the quality of creative text.
+Please evaluate the quality of an AI model's response to a creative question in the capacity of an impartial judge. You'll need to assess the response on the following dimensions: Creativity, Richness, User Demand Fulfillment, and Logical Coherence. We will provide you with a creative question and the AI model's response and a reference answer for your evaluation. As you begin your assessment, follow this process:
+1. Evaluate the AI model's answers on different dimensions, pointing out its strengths or weaknesses in each dimension and assigning a score of 1 to 10 for each.
+2. Finally, based on the assessments across dimensions, provide an overall score of 1 to 10 for the AI model's response.
+3. Your scoring should be as stringent as possible and follow the scoring rules below:
+
+In general, the higher the quality of the model's response and its strict adherence to user needs, the higher the score. Responses that do not meet user needs will receive lower scores.
+
+Scoring rules:
+Creativity:
+Scores 1-2 when there is no innovation or uniqueness in the content.
+Scores 3-4 when providing partially original content but with low creative quality.
+Scores 5-6 when mostly creative but lacks significant novelty, with moderate quality.
+Scores 7-8 when having novelty and high-quality content.
+Scores 9-10 when highly novel and of exceptional quality compared to the reference answer.
+
+Richness:
+Scores 1-2 when lacking depth and breadth, with very limited information.
+Scores 3-4 when limited in depth and breadth, with fewer explanations and examples, showing low diversity.
+Scores 5-6 when limited in depth and breadth but provides basic necessary information.
+Scores 7-8 when providing depth and useful additional information.
+Scores 9-10 when providing exceptional depth, breadth, and high diversity compared to the reference answer.
+
+User Demand Fulfillment:
+Scores 1-2 when completely irrelevant to the requirements, especially in style, theme, or significant word count difference.
+Scores 3-4 when limited understanding of requirements, providing minimal relevant information, unable to help solve the problem, and significant discrepancies in style, theme, or word count.
+Scores 5-6 when partially understanding requirements, providing somewhat relevant responses, with basic adherence to style, theme, and word count.
+Scores 7-8 when understanding requirements well, providing highly relevant responses, adhering to style, theme, and word count requirements.
+Scores 9-10 when accurately understanding all requirements, providing highly relevant and personalized responses, fully adhering to style, theme, and word count requirements, and significantly better meeting user needs than the reference answer.
+
+Logical Coherence:
+Scores 1-2 when entirely incoherent, lacking any logic, and not matching the question or known information.
+Scores 3-4 when somewhat coherent but with many logical errors or inconsistencies.
+Scores 5-6 when mostly coherent, with few errors, but may struggle to maintain complete coherence in complex situations.
+Scores 7-8 when excellent logical handling, very few errors.
+Scores 9-10 when flawless logic, impeccable in handling complexity, and significantly higher logical coherence compared to the reference answer.
+
+Overall Score:
+Scores 1-2 when irrelevant to the question, factually incorrect, or generates harmful content.
+Scores 3-4 when no serious errors, mostly harmless, but of low quality and does not meet requirements.
+Scores 5-6 when basically meeting requirements but performing poorly in some dimensions, with moderate quality.
+Scores 7-8 when performing well in all dimensions.
+Scores 9-10 when fully addressing user questions and all requirements, significantly surpassing the reference answer.
+
+Please remember, you must evaluate and explain before scoring. After your explanation for each dimension, add the score for that dimension. Finally, at the end of your response, in the format of the dictionary (including brackets), return all your scoring results, ensuring your scores are integers:
+{'Dimension One': Score, 'Dimension Two': Score, ..., 'Overall Score': Score}, for example: {'Creativity': 9, 'Richness': 6, ..., 'Overall Score': 7}.\n
+"""
+
+compare_cn_prefix = """
+请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
+评分要求(重要性依次递减):
+1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
+2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答
+3. 好的回答必须具有创造性的词语和表达丰富度
+
+[用户问题]
+"""
+
+compare_cn_suffix = """
+根据评分要求,在以下 2 个选项中做出选择:
+A. 回答1更好
+B. 回答2更好
+并提供你的解释原因。
+
+如果你认为回答1更好,你的输出应形如:
+选择:A
+原因:blahblah blahblah\n
+
+如果你认为回答2更好,你的输出应形如:
+选择:B
+原因:blahblah blahblah\n
+"""
+
+compare_cn_prefix_4opt = """
+请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
+评分要求(重要性依次递减):
+1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
+2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答
+3. 好的回答必须具有创造性的词语和表达丰富度
+
+[用户问题]
+"""
+
+compare_cn_suffix_4opt = """
+根据评分要求,在以下 2 个选项中做出选择:
+A. 回答1更好
+B. 回答2更好
+C. 回答1、2都好
+D. 回答1、2都不好
+并提供你的解释原因。
+
+如果你认为回答1更好,你的输出应形如:
+选择:A
+原因:blahblah blahblah\n
+
+如果你认为回答2更好,你的输出应形如:
+选择:B
+原因:blahblah blahblah\n
+
+如果你认为回答1、2都很好,你的输出应形如:
+选择:C
+原因:blahblah blahblah\n
+
+如果你认为回答1、2都不好,你的输出应形如:
+选择:D
+原因:blahblah blahblah\n
+"""
 
 
 def prompt_construct(sample):
     lan = sample['others']['language']
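Both scoring prompts above instruct the judge to end its reply with a Python-style dict of integer scores. A minimal, hypothetical sketch of pulling that dict out of a judge reply is shown here for orientation only; it is not the actual CreationBenchSummarizer logic:

import ast
import re

def extract_scores(judge_reply: str) -> dict:
    # Keep the last {...} block in the reply, which the prompt asks to be the score dict.
    last = None
    for last in re.finditer(r'\{.*?\}', judge_reply, flags=re.S):
        pass
    return ast.literal_eval(last.group(0)) if last else {}

# Hypothetical judge reply:
reply = "创造性方面表现中等……{'创造性': 7, '丰富度': 6, '满足用户需求': 8, '逻辑连贯性': 7, '综合得分': 7}"
print(extract_scores(reply))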
@@ -117,6 +273,41 @@ def prompt_construct(sample):
     return prompt, suffix
 
 
+def prompt_construct_score_with_ref(sample):
+    lan = sample['others']['language']
+    question = sample['question']
+    ref = sample['ref']
+    if lan == 'zh':
+        prompt = chn_base_prefix_score_with_ref + '创作类问题:' + str(
+            question) + '\n[参考答案开始]\n' + str(
+                ref) + '\n[参考答案结束]\n' + '\n[模型回答开始]\n'
+        suffix = '\n[模型回答结束]\n'
+    elif lan == 'en':
+        prompt = eng_base_prefix_score_with_ref + 'Creative Question: ' + str(
+            question) + '\n[Reference start]\n' + str(
+                ref) + '\n[Reference end]\n' + "\n[Model's response start]\n"
+        suffix = "\n[Model's response end]\n"
+    return prompt, suffix
+
+
+def prompt_construct_compare(sample):
+    lan = sample['others']['language']
+    question = sample['question']
+    if lan == 'zh':
+        prompt = compare_cn_prefix + str(question)
+        suffix = compare_cn_suffix
+    return prompt, suffix
+
+
+def prompt_construct_compare_4opt(sample):
+    lan = sample['others']['language']
+    question = sample['question']
+    if lan == 'zh':
+        prompt = compare_cn_prefix_4opt + str(question)
+        suffix = compare_cn_suffix_4opt
+    return prompt, suffix
+
+
 @LOAD_DATASET.register_module()
 class CreationBenchDataset(SubjectiveCmpDataset):
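A quick, hypothetical usage sketch of the new helper (field values invented; the record shape matches what the load() changes below provide):

# Hypothetical sample in the shape produced by CreationBenchDataset.load() below.
sample = {
    'question': 'Write a short poem about autumn.',
    'ref': 'A reference poem supplied with the dataset ...',
    'others': {'language': 'en'},
}
prefix, suffix = prompt_construct_score_with_ref(sample)
# prefix ends with "[Model's response start]"; the evaluated model's answer is then
# placed between prefix and suffix by the evaluator template
# "{score_with_ref_prefix}{prediction}{score_with_ref_suffix}".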
@@ -132,7 +323,12 @@ class CreationBenchDataset(SubjectiveCmpDataset):
             data['gpt4_prefix'] = prefix
             data['gpt4_suffix'] = suffix
             data['judge']['others'] = data['others']
-            # data['ref'] = data['others']['reference']
+            data['ref'] = data['others']['reference']
+            data['score_with_ref_prefix'], data[
+                'score_with_ref_suffix'] = prompt_construct_score_with_ref(
+                    data)
+            # data['compare_cn_prefix'], data['compare_cn_suffix'] = prompt_construct_compare(data)
+            # data['compare_cn_prefix_4opt'], data['compare_cn_suffix_4opt'] = prompt_construct_compare_4opt(data)
             creation_dataset.append(data)
         dataset = Dataset.from_list(creation_dataset)
         return dataset
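Schematically, each record that load() now emits for the with-reference scoring path looks like this (field names taken from the code above; values are placeholders):

# Schematic record after CreationBenchDataset.load() with the new fields.
record = {
    'question': '...',
    'capability': '...',
    'others': {'language': 'zh', 'reference': '...'},
    'ref': '...',                        # copied from others['reference']
    'gpt4_prefix': '...', 'gpt4_suffix': '...',
    'score_with_ref_prefix': '...',      # judge prompt prefix that embeds the reference answer
    'score_with_ref_suffix': '...',
    'judge': {'others': {'language': 'zh', 'reference': '...'}},
}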