Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Feature] add subject ir dataset (#755)

* add subject ir
* Add ir dataset
* Add ir dataset

parent be369c3e06
commit 2163f9398f
configs/datasets/subjective_ir/ir_judgedby_autoj.py  (new file, 71 lines added)
@@ -0,0 +1,71 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import IRDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'ref'],
    output_column='judge',
)

subjective_all_sets = [
    "information_retrieval",
]
data_path = "data/subjective/"

subjective_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        # Chinese judge prompt; gist: "Write a critique of the
                        # response to the user question below, then give it a
                        # final 1-10 rating in the form '评分: [[5]]'."
                        prompt="""为上传的针对给定用户问题的回应撰写评论, 并为该回复打分:

[BEGIN DATA]
***
[用户问询]: {question}
***
[回应]: {prediction}
***
[参考答案]: {ref}
***
[END DATA]

请根据参考答案为这个回应撰写评论. 在这之后, 你应该按照如下格式给这个回应一个最终的1-10范围的评分: "[[评分]]", 例如: "评分: [[5]]"."""
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=IRDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
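Not part of the diff: a minimal sketch of how a dataset config like the one above is normally pulled into an OpenCompass entry config through read_base. Only the import path comes from the file added above; the entry-config name and the surrounding fields are assumptions.

# Hypothetical entry config (e.g. configs/eval_subjective_ir.py), illustrative only.
from mmengine.config import read_base

with read_base():
    # import path mirrors configs/datasets/subjective_ir/ir_judgedby_autoj.py added above
    from .datasets.subjective_ir.ir_judgedby_autoj import subjective_datasets

datasets = [*subjective_datasets]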
configs/datasets/subjective_ir/ir_judgedby_gpt4.py  (new file, 59 lines added)
@@ -0,0 +1,59 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import IRDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'gpt4_prefix', 'gpt4_suffix', 'ref'],
    output_column='judge',
)

subjective_all_sets = [
    "information_retrieval",
]
data_path = "data/subjective/"

subjective_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt="{gpt4_prefix}{prediction}{gpt4_suffix}"
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=IRDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
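The gpt4-judged variant delegates the judge prompt to the dataset: the reader exposes gpt4_prefix and gpt4_suffix columns, which IRDataset fills via prompt_construct (defined in opencompass/datasets/subject_ir.py below). A sketch, with made-up sample values, of what the "{gpt4_prefix}{prediction}{gpt4_suffix}" template expands to:

# Illustrative only; prompt_construct comes from opencompass/datasets/subject_ir.py.
sample = {
    'question': 'Who wrote the novel "Dune"?',
    'others': {'lan': 'en', 'answers': ['Frank Herbert']},
}
prefix, suffix = prompt_construct(sample)
prediction = 'The novel "Dune" was written by Frank Herbert.'
judge_prompt = prefix + prediction + suffix   # the string the LM judge receives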
@@ -79,7 +79,7 @@ eval = dict(
 )
 
 summarizer = dict(
-    type=AlignmentBenchSummarizer,
+    type=AlignmentBenchSummarizer, judge_type = 'general'
 )
 
 work_dir = 'outputs/alignment_bench/'
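By analogy, an entry config for the new IR dataset could point its summarizer at the IRSummarizer added in this commit. The line below is a hedged sketch, not part of the diff:

# Hypothetical summarizer entry for the IR dataset; IRSummarizer (see
# opencompass/summarizers/information_retrival.py below) defaults to
# judge_type='autoj' and also accepts 'general'.
summarizer = dict(type=IRSummarizer, judge_type='autoj')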
@@ -84,6 +84,7 @@ from .subject_alignmentbench import AlignmentBenchDataset  # noqa: F401, F403
 from .subject_corev2 import Corev2Dataset  # noqa: F401, F403
 from .subject_creationbench import CreationBenchDataset  # noqa: F401, F403
 from .subject_creationv01 import Creationv01Dataset  # noqa: F401, F403
+from .subject_ir import IRDataset  # noqa: F401, F403
 from .subject_multiround import MultiroundDataset  # noqa: F401, F403
 from .subjective_cmp import SubjectiveCmpDataset  # noqa: F401, F403
 from .summedits import *  # noqa: F401, F403
opencompass/datasets/subject_ir.py  (new file, 88 lines added)
@@ -0,0 +1,88 @@
# flake8: noqa: E501
import json
import os.path as osp
import re
from typing import Optional

from datasets import Dataset, DatasetDict

from opencompass.registry import LOAD_DATASET

from .subjective_cmp import SubjectiveCmpDataset

eng_base_prefix = """
You are a judger. Please impartially judge whether an AI model's response to a question is correct based on the reference answers. You need to provide a conclusion of "correct" or "wrong," followed by the corresponding reasoning.

Note that since the reference answer is a candidate list, the AI model's response only needs to align with one item in the list to be deemed "correct."

Your judgment must strictly adhere to the following format:
Conclusion: [[Correct]]
Reasoning: xxx.

Conclusion: [[Wrong]]
Reasoning: xxx.

[Question Start]
{question}
[Question End]

[Reference Answers Start]
{ref}
[Reference Answers End]

[Model Response Start]
"""

# Chinese counterpart of eng_base_prefix: same judging instructions, with the
# verdict rendered as [[对]] (correct) / [[错]] (wrong).
chn_base_prefix = """
你是一个评判者，请你基于参考答案，公正地评判一个AI模型对于问题的回答是否正确。你需要给出“对或错”的结论，然后再给出相应的理由。
请注意，由于参考答案是一个候选列表，因此AI模型的回答只要符合列表中的某一项即可判断为“对”。
你的评判必须严格遵守以下格式：
结论：[[对]]
理由：xxx。

结论：[[错]]
理由：xxx。

[问题开始]
{question}
[问题结束]

[参考答案开始]
{ref}
[参考答案结束]

[模型回答开始]
"""


def prompt_construct(sample):
    # assumes every sample is tagged with lan == 'zh' or lan == 'en'
    lan = sample['others']['lan']
    question = sample['question']
    if lan == 'zh':
        prefix = chn_base_prefix.format(question=sample['question'],
                                        ref=str(sample['others']['answers']))
        suffix = '\n[模型回答结束]\n'
    elif lan == 'en':
        prefix = eng_base_prefix.format(question=sample['question'],
                                        ref=str(sample['others']['answers']))
        suffix = '\n[Model Response End]\n'
    return prefix, suffix


@LOAD_DATASET.register_module()
class IRDataset(SubjectiveCmpDataset):

    def load(
        self,
        path: str,
        name: str,
    ):
        dataset = list(super().load(path, name))
        subject_dataset = []
        for data in dataset:
            data['gpt4_prefix'], data['gpt4_suffix'] = prompt_construct(data)
            data['judge']['others'] = data['others']
            data['ref'] = str(data['others']['answers'])
            subject_dataset.append(data)
        dataset = Dataset.from_list(subject_dataset)
        return dataset
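For orientation, a hedged sketch of the per-sample structure IRDataset.load works with, assuming the parent SubjectiveCmpDataset yields records with the question/others/judge fields used above. All values below are made up.

# Hypothetical raw record before IRDataset.load post-processing.
raw = {
    'question': 'Who wrote the novel "Dune"?',
    'capability': 'information_retrieval',
    'others': {'lan': 'en', 'answers': ['Frank Herbert']},
    'judge': {},
}
# After load(), each record additionally carries:
#   gpt4_prefix / gpt4_suffix : judge-prompt pieces from prompt_construct()
#   ref                       : str(others['answers']), filled into {ref} in the templates
#   judge['others']           : copy of 'others', so the summarizer can read the language tag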
@@ -5,5 +5,6 @@ from .corev2 import Corev2Summarizer  # noqa: F401
 from .creationbench import CreationBenchSummarizer
 from .creationv01 import Creationv01Summarizer  # noqa: F401
 from .default import DefaultSummarizer  # noqa: F401
+from .information_retrival import IRSummarizer  # noqa: F401
 from .multiround import MultiroundSummarizer  # noqa: F401
 from .subjective import SubjectiveSummarizer  # noqa: F401
@@ -128,7 +128,7 @@ def get_dimension_results(judged_answers, references, fout, fout_flag, model):
         writer = csv.writer(csvfile)
         if fout_flag == 0:
             writer.writerow(['模型'] + columns)
-            fout_flag += 1
+
         for row in rows:
             writer.writerow([row] +
                             [scores[row][column] for column in columns])
@@ -184,7 +184,6 @@ def get_capability_results(judged_answers,
                 sub_header.extend([category + '总分'])
                 sub_header.extend(sub_categories)
             writer.writerow(sub_header)
-            fout_flag += 1
 
         row = [model]
         row.append(scores[model]['总分'])
@@ -203,7 +202,7 @@ class AlignmentBenchSummarizer:
             It's expected to be filled out at runtime.
     """
 
-    def __init__(self, config: ConfigDict, judge_type: str) -> None:
+    def __init__(self, config: ConfigDict, judge_type='general') -> None:
         self.tasks = []
         self.cfg = config
         self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
@@ -252,8 +251,10 @@ class AlignmentBenchSummarizer:
                     if self.judge_type == 'general':
                         get_dimension_results(judged_answers, references, fout,
                                               fout_flag, model)
+                        fout_flag += 1
                     get_capability_results(judged_answers, references, fout2,
                                            fout_flag2, model, self.category)
+                    fout_flag2 += 1
             else:
                 print(subdir_path + ' is not exist! please check!')
         if self.judge_type == 'general':
opencompass/summarizers/information_retrival.py  (new file, 138 lines added)
@@ -0,0 +1,138 @@
# flake8: noqa: E501
import csv
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime

import numpy as np
from mmengine import ConfigDict

try:
    from prettytable import from_csv
except ImportError:
    from_csv = None

from opencompass.utils import model_abbr_from_cfg

from .subjective_post_process import post_process_autoj
from .utils import get_judgeanswer_and_reference, get_outdir


def post_process_ir(judgement: str):
    """Extract the score from a judgement string shaped like:

    Conclusion: [[Correct]]\nReasoning: xxx
    """
    matches = re.findall(r'\[\[(.*?)\]\]', judgement)
    if matches:
        matches = matches[0]
        if matches in ['Correct', 'Wrong', '对', '错']:
            if matches == 'Correct' or matches == '对':
                return {'score': 1}
            else:
                return {'score': 0}
        else:
            return None
    else:
        return None


def get_results(
    judged_answers,
    references,
    fout,
    fout_flag,
    model,
):
    # accumulate scores overall and per language ('zh' / 'en')
    capability_ratings = defaultdict(int)
    capability_counts = defaultdict(int)
    for ans, ref in zip(judged_answers, references):
        lan = ref['others']['lan']
        capability_ratings['total'] += ans['score']
        capability_counts['total'] += 1
        capability_ratings[lan] += ans['score']
        capability_counts[lan] += 1

    capability_avg_ratings = defaultdict(float)

    for capability, total_score in capability_ratings.items():
        capability_avg_ratings[
            capability] = total_score / capability_counts[capability]

    scores = {model: capability_avg_ratings}

    with open(fout, 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if fout_flag == 0:
            num_header = [str(i) for i in range(4)]
            writer.writerow(num_header)

            header = ['模型']
            for category in capability_avg_ratings:
                header.append(category)
            writer.writerow(header)

        row = [model]
        for category in capability_avg_ratings:
            row.append(scores[model][category])
        writer.writerow(row)


class IRSummarizer:
    """Do the subjectivity analysis based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """

    def __init__(self, config: ConfigDict, judge_type='autoj') -> None:
        self.tasks = []
        self.cfg = config
        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
        self.eval_model_abbrs = [
            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
        ]
        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
        self.judge_type = judge_type
        assert self.judge_type in ['general', 'autoj']
        self.judge_map = {
            'general': post_process_ir,
            'autoj': post_process_autoj,
        }
        self.judge_function = self.judge_map[self.judge_type]

    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            None. The per-model scores are appended to a CSV in the output
            directory and printed to stdout.
        """
        dataset_cfgs = self.cfg['datasets']
        output_dir, results_folder = get_outdir(self.cfg, time_str)
        fout_flag = 0
        for eval_model_abbr in self.eval_model_abbrs:
            subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model, judge_model = eval_model_abbr, self.judge_abbr
                fout = osp.join(output_dir,
                                'judged-by--' + judge_model + '.csv')
                for dataset in dataset_cfgs:
                    judged_answers, references = get_judgeanswer_and_reference(
                        dataset, subdir_path, self.judge_function)
                    get_results(judged_answers, references, fout, fout_flag,
                                model)
                    fout_flag += 1
            else:
                print(subdir_path + ' does not exist! Please check!')
        with open(fout, 'r') as f:
            x = from_csv(f)
        print(x)
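A quick, self-contained check of post_process_ir on the two verdict formats it recognises (anything else returns None and the sample is skipped by the summarizer); the judgement strings below are made up:

assert post_process_ir('Conclusion: [[Correct]]\nReasoning: matches the reference.') == {'score': 1}
assert post_process_ir('结论：[[错]]\n理由：与参考答案不符。') == {'score': 0}
assert post_process_ir('No verdict given.') is None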