Mirror of https://github.com/open-compass/opencompass.git
[Feature] add subject ir dataset (#755)

* add subject ir
* add ir dataset

parent be369c3e06
commit 2163f9398f
configs/datasets/subjective_ir/ir_judgedby_autoj.py  (new file, 71 lines)
@@ -0,0 +1,71 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import IRDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'ref'],
    output_column='judge',
)

subjective_all_sets = [
    "information_retrieval",
]
data_path ="data/subjective/"

subjective_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt = """为上传的针对给定用户问题的回应撰写评论, 并为该回复打分:

[BEGIN DATA]
***
[用户问询]: {question}
***
[回应]: {prediction}
***
[参考答案]: {ref}
***
[END DATA]

请根据参考答案为这个回应撰写评论. 在这之后, 你应该按照如下格式给这个回应一个最终的1-10范围的评分: "[[评分]]", 例如: "评分: [[5]]"."""
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=IRDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
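Note (illustrative, not part of the diff): a dataset config like the one above is normally pulled into a top-level evaluation config via read_base and paired with the IRSummarizer added later in this commit. A minimal sketch, assuming `models`, `judge_model` and the partitioner/runner blocks are defined elsewhere as in other subjective-eval configs, and with a hypothetical output directory:

from mmengine.config import read_base

with read_base():
    # path mirrors the new config file above; adjust to your own layout
    from .datasets.subjective_ir.ir_judgedby_autoj import subjective_datasets

from opencompass.summarizers import IRSummarizer

datasets = [*subjective_datasets]

# `models` and `judge_model` are assumed to be defined elsewhere in the config;
# IRSummarizer reads both from the runtime config.
summarizer = dict(type=IRSummarizer, judge_type='autoj')
work_dir = 'outputs/information_retrieval/'  # hypothetical output directory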
configs/datasets/subjective_ir/ir_judgedby_gpt4.py  (new file, 59 lines)
@@ -0,0 +1,59 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import IRDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'gpt4_prefix', 'gpt4_suffix', 'ref'],
    output_column='judge',
)

subjective_all_sets = [
    "information_retrieval",
]
data_path ="data/subjective/"

subjective_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt = "{gpt4_prefix}{prediction}{gpt4_suffix}"
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=IRDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
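Note (illustrative, not part of the diff): unlike the Auto-J config above, the judge prompt here is assembled per sample from columns that IRDataset.load() fills in (see opencompass/datasets/subject_ir.py later in this commit). Roughly, with the placeholder strings below invented for illustration:

# Sketch of what the evaluator template expands to for one English sample:
judge_prompt = '{gpt4_prefix}{prediction}{gpt4_suffix}'.format(
    gpt4_prefix='<judge instructions + [Question Start]...[Reference Answers End] + [Model Response Start]>',
    prediction='<the evaluated model answer>',
    gpt4_suffix='\n[Model Response End]\n',
)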
@@ -79,7 +79,7 @@ eval = dict(
 )
 
 summarizer = dict(
-    type=AlignmentBenchSummarizer,
+    type=AlignmentBenchSummarizer, judge_type = 'general'
 )
 
 work_dir = 'outputs/alignment_bench/'
opencompass/datasets/__init__.py

@@ -84,6 +84,7 @@ from .subject_alignmentbench import AlignmentBenchDataset  # noqa: F401, F403
 from .subject_corev2 import Corev2Dataset  # noqa: F401, F403
 from .subject_creationbench import CreationBenchDataset  # noqa: F401, F403
 from .subject_creationv01 import Creationv01Dataset  # noqa: F401, F403
+from .subject_ir import IRDataset  # noqa: F401, F403
 from .subject_multiround import MultiroundDataset  # noqa: F401, F403
 from .subjective_cmp import SubjectiveCmpDataset  # noqa: F401, F403
 from .summedits import *  # noqa: F401, F403
opencompass/datasets/subject_ir.py  (new file, 88 lines)
@@ -0,0 +1,88 @@
# flake8: noqa: E501
import json
import os.path as osp
import re
from typing import Optional

from datasets import Dataset, DatasetDict

from opencompass.registry import LOAD_DATASET

from .subjective_cmp import SubjectiveCmpDataset

eng_base_prefix = """
You are a judger. Please impartially judge whether an AI model's response to a question is correct based on the reference answers. You need to provide a conclusion of "correct" or "wrong," followed by the corresponding reasoning.

Note that since the reference answer is a candidate list, the AI model's response only needs to align with one item in the list to be deemed "correct."

Your judgment must strictly adhere to the following format:
Conclusion: [[Correct]]
Reasoning: xxx.

Conclusion: [[Wrong]]
Reasoning: xxx.

[Question Start]
{question}
[Question End]

[Reference Answers Start]
{ref}
[Reference Answers End]

[Model Response Start]
"""

chn_base_prefix = """
你是一个评判者，请你基于参考答案，公正地评判一个AI模型对于问题的回答是否正确。你需要给出“对或错”的结论，然后再给出相应的理由。
请注意，由于参考答案是一个候选列表，因此AI模型的回答只要符合列表中的某一项即可判断为“对”。
你的评判必须严格遵守以下格式：
结论：[[对]]
理由：xxx。

结论：[[错]]
理由：xxx。

[问题开始]
{question}
[问题结束]

[参考答案开始]
{ref}
[参考答案结束]

[模型回答开始]
"""


def prompt_construct(sample):
    lan = sample['others']['lan']
    question = sample['question']
    if lan == 'zh':
        prefix = chn_base_prefix.format(question=sample['question'],
                                        ref=str(sample['others']['answers']))
        suffix = '\n[模型回答结束]\n'
    elif lan == 'en':
        prefix = eng_base_prefix.format(question=sample['question'],
                                        ref=str(sample['others']['answers']))
        suffix = '\n[Model Response End]\n'
    return prefix, suffix


@LOAD_DATASET.register_module()
class IRDataset(SubjectiveCmpDataset):

    def load(
        self,
        path: str,
        name: str,
    ):
        dataset = list(super().load(path, name))
        subject_dataset = []
        for data in dataset:
            data['gpt4_prefix'], data['gpt4_suffix'] = prompt_construct(data)
            data['judge']['others'] = data['others']
            data['ref'] = str(data['others']['answers'])
            subject_dataset.append(data)
        dataset = Dataset.from_list(subject_dataset)
        return dataset
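Note (illustrative, not part of the diff): the field names below are inferred from the code above; the actual files under data/subjective/ may differ. A minimal assumed sample and what load() derives from it:

sample = {
    'question': 'Who wrote "Pride and Prejudice"?',
    'capability': 'information_retrieval',
    'others': {'lan': 'en', 'answers': ['Jane Austen']},
    'judge': {},
}

prefix, suffix = prompt_construct(sample)
# prefix == eng_base_prefix with {question} and {ref} filled in
# suffix == '\n[Model Response End]\n'
# load() then sets, per sample:
#   data['gpt4_prefix'], data['gpt4_suffix'] = prefix, suffix
#   data['judge']['others'] = data['others']
#   data['ref'] = str(data['others']['answers'])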
opencompass/summarizers/__init__.py

@@ -5,5 +5,6 @@ from .corev2 import Corev2Summarizer  # noqa: F401
 from .creationbench import CreationBenchSummarizer
 from .creationv01 import Creationv01Summarizer  # noqa: F401
 from .default import DefaultSummarizer  # noqa: F401
+from .information_retrival import IRSummarizer  # noqa: F401
 from .multiround import MultiroundSummarizer  # noqa: F401
 from .subjective import SubjectiveSummarizer  # noqa: F401
opencompass/summarizers/alignmentbench.py

@@ -128,7 +128,7 @@ def get_dimension_results(judged_answers, references, fout, fout_flag, model):
         writer = csv.writer(csvfile)
         if fout_flag == 0:
             writer.writerow(['模型'] + columns)
-            fout_flag += 1
+
         for row in rows:
             writer.writerow([row] +
                             [scores[row][column] for column in columns])

@@ -184,7 +184,6 @@ def get_capability_results(judged_answers,
                 sub_header.extend([category + '总分'])
                 sub_header.extend(sub_categories)
             writer.writerow(sub_header)
-            fout_flag += 1
 
         row = [model]
         row.append(scores[model]['总分'])

@@ -203,7 +202,7 @@ class AlignmentBenchSummarizer:
             It's expected to be filled out at runtime.
     """
 
-    def __init__(self, config: ConfigDict, judge_type: str) -> None:
+    def __init__(self, config: ConfigDict, judge_type='general') -> None:
         self.tasks = []
         self.cfg = config
         self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']

@@ -252,8 +251,10 @@ class AlignmentBenchSummarizer:
                     if self.judge_type == 'general':
                         get_dimension_results(judged_answers, references, fout,
                                               fout_flag, model)
+                        fout_flag += 1
                     get_capability_results(judged_answers, references, fout2,
                                            fout_flag2, model, self.category)
+                    fout_flag2 += 1
             else:
                 print(subdir_path + ' is not exist! please check!')
         if self.judge_type == 'general':
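Note (a hedged reading, not stated in the diff): moving the fout_flag / fout_flag2 increments out of the helpers and up to the call site in summarize() matters because the flags are plain ints. Incrementing them inside get_dimension_results or get_capability_results never reached the caller, so the CSV header guard never flipped. A minimal sketch of the pitfall:

def helper(flag):
    flag += 1        # rebinds a local int; the caller's variable is untouched

flag = 0
helper(flag)
print(flag)          # prints 0, so a header guarded by `if flag == 0` would be
                     # rewritten on every call unless the caller increments it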
opencompass/summarizers/information_retrival.py  (new file, 138 lines)
@@ -0,0 +1,138 @@
# flake8: noqa: E501
import csv
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime

import numpy as np
from mmengine import ConfigDict

try:
    from prettytable import from_csv
except ImportError:
    from_csv = None

from opencompass.utils import model_abbr_from_cfg

from .subjective_post_process import post_process_autoj
from .utils import get_judgeanswer_and_reference, get_outdir


def post_process_ir(judgement: str):
    """Input a string like below:

    Conclusion: [[Correct]]\nReasoning: xxx
    and extract the score
    """
    matches = re.findall(r'\[\[(.*?)\]\]', judgement)
    if matches:
        matches = matches[0]
        if matches in ['Correct', 'Wrong', '对', '错']:
            if matches == 'Correct' or matches == '对':
                return {'score': 1}
            else:
                return {'score': 0}
        else:
            return None
    else:
        return None


def get_results(
    judged_answers,
    references,
    fout,
    fout_flag,
    model,
):
    capability_ratings = defaultdict(int)
    capability_counts = defaultdict(int)
    for ans, ref in zip(judged_answers, references):
        lan = ref['others']['lan']
        capability_ratings['total'] += ans['score']
        capability_counts['total'] += 1
        capability_ratings[lan] += ans['score']
        capability_counts[lan] += 1

    capability_avg_ratings = defaultdict(float)

    for capability, total_score in capability_ratings.items():
        capability_avg_ratings[
            capability] = total_score / capability_counts[capability]

    scores = {model: capability_avg_ratings}

    with open(fout, 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if fout_flag == 0:
            num_header = [str(i) for i in range(4)]
            writer.writerow(num_header)

            header = ['模型']
            for category in capability_avg_ratings:
                header.append(category)
            writer.writerow(header)

        row = [model]
        for category in capability_avg_ratings:
            row.append(scores[model][category])
        writer.writerow(row)


class IRSummarizer:
    """Do the subjectivity analyze based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """

    def __init__(self, config: ConfigDict, judge_type='autoj') -> None:
        self.tasks = []
        self.cfg = config
        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
        self.eval_model_abbrs = [
            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
        ]
        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
        self.judge_type = judge_type
        assert self.judge_type in ['general', 'autoj']
        self.judge_map = {
            'general': post_process_ir,
            'autoj': post_process_autoj,
        }
        self.judge_function = self.judge_map[self.judge_type]

    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            pd.DataFrame: The summary results.
        """
        dataset_cfgs = self.cfg['datasets']
        output_dir, results_folder = get_outdir(self.cfg, time_str)
        fout_flag = 0
        for eval_model_abbr in self.eval_model_abbrs:
            subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model, judge_model = eval_model_abbr, self.judge_abbr
                fout = osp.join(output_dir,
                                'judged-by--' + judge_model + '.csv')
                for dataset in dataset_cfgs:
                    judged_answers, references = get_judgeanswer_and_reference(
                        dataset, subdir_path, self.judge_function)
                    get_results(judged_answers, references, fout, fout_flag,
                                model)
                    fout_flag += 1
            else:
                print(subdir_path + ' is not exist! please check!')
        with open(fout, 'r') as f:
            x = from_csv(f)
        print(x)
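Note (illustrative, not part of the diff): with judge_type='general', the summarizer parses judge verdicts with post_process_ir. A quick check of its behavior, with expected outputs shown as comments:

print(post_process_ir('Conclusion: [[Correct]]\nReasoning: matches the reference answer.'))
# -> {'score': 1}
print(post_process_ir('结论：[[错]]\n理由：回答与参考答案不符。'))
# -> {'score': 0}
print(post_process_ir('No [[verdict]] in the expected vocabulary.'))
# -> None

get_results then averages these 0/1 scores into a 'total' bucket plus one bucket per language tag found in ref['others']['lan'], and appends one CSV row per evaluated model.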