[Feature] Add subjective IR dataset (#755)

* Add subjective IR dataset
bittersweet1999 2024-01-05 20:00:57 +08:00 committed by GitHub
parent be369c3e06
commit 2163f9398f
8 changed files with 363 additions and 4 deletions

View File

@@ -0,0 +1,71 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import IRDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'ref'],
output_column='judge',
)
subjective_all_sets = [
"information_retrieval",
]
data_path ="data/subjective/"
subjective_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt="{question}"
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = """为上传的针对给定用户问题的回应撰写评论, 并为该回复打分:
[BEGIN DATA]
***
[用户问询]: {question}
***
[回应]: {prediction}
***
[参考答案]: {ref}
***
[END DATA]
请根据参考答案为这个回应撰写评论. 在这之后, 你应该按照如下格式给这个回应一个最终的1-10范围的评分: "[[评分]]", 例如: "评分: [[5]]"."""
),
]),
),
),
pred_role="BOT",
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
type=IRDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
))
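
The config above sends the raw {question} to the model under test and then asks an LM judge to grade the prediction against the reference on a 1-10 scale. Below is a rough, editor-added sketch of that substitution; it is not part of this commit and not OpenCompass's actual templating code, the template is abbreviated, and the sample values are invented.

# Editor's sketch: roughly what the LM judge sees for one sample once
# {question}, {prediction} and {ref} are filled into the template above.
# Abbreviated template; made-up sample values.
judge_prompt = (
    '为上传的针对给定用户问题的回应撰写评论, 并为该回复打分:\n'
    '[用户问询]: {question}\n[回应]: {prediction}\n[参考答案]: {ref}\n'
    '请根据参考答案为这个回应撰写评论, 并给出一个1-10范围的评分: "[[评分]]"'
).format(
    question='《红楼梦》的作者是谁?',  # dataset column 'question'
    prediction='曹雪芹',              # answer produced by the evaluated model
    ref="['曹雪芹']",                 # dataset column 'ref'
)
print(judge_prompt)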

View File

@@ -0,0 +1,59 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import IRDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'gpt4_prefix', 'gpt4_suffix', 'ref'],
output_column='judge',
)
subjective_all_sets = [
"information_retrieval",
]
data_path ="data/subjective/"
subjective_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt="{question}"
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = "{gpt4_prefix}{prediction}{gpt4_suffix}"
),
]),
),
),
pred_role="BOT",
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
type=IRDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
))
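
Unlike the first config, this variant does not hard-code the judge instructions; it relies on per-sample gpt4_prefix and gpt4_suffix fields that IRDataset.load constructs (see subject_ir.py below). A minimal sketch of the assembly, editor-added with an illustrative function name only:

# Minimal sketch (not part of this commit): the "{gpt4_prefix}{prediction}{gpt4_suffix}"
# template above boils down to this concatenation, where the prefix and suffix
# are built per sample by prompt_construct in subject_ir.py.
def assemble_judge_prompt(row: dict, prediction: str) -> str:
    return row['gpt4_prefix'] + prediction + row['gpt4_suffix']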

View File

@@ -79,7 +79,7 @@ eval = dict(
)
summarizer = dict(
type=AlignmentBenchSummarizer,
type=AlignmentBenchSummarizer, judge_type = 'general'
)
work_dir = 'outputs/alignment_bench/'

View File

@@ -84,6 +84,7 @@ from .subject_alignmentbench import AlignmentBenchDataset # noqa: F401, F403
from .subject_corev2 import Corev2Dataset # noqa: F401, F403
from .subject_creationbench import CreationBenchDataset # noqa: F401, F403
from .subject_creationv01 import Creationv01Dataset # noqa: F401, F403
from .subject_ir import IRDataset # noqa: F401, F403
from .subject_multiround import MultiroundDataset # noqa: F401, F403
from .subjective_cmp import SubjectiveCmpDataset # noqa: F401, F403
from .summedits import * # noqa: F401, F403

View File

@@ -0,0 +1,88 @@
# flake8: noqa: E501
import json
import os.path as osp
import re
from typing import Optional
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
from .subjective_cmp import SubjectiveCmpDataset
eng_base_prefix = """
You are a judge. Please impartially judge whether an AI model's response to a question is correct based on the reference answers. You need to provide a conclusion of "correct" or "wrong," followed by the corresponding reasoning.
Note that since the reference answer is a candidate list, the AI model's response only needs to align with one item in the list to be deemed "correct."
Your judgment must strictly adhere to the following format:
Conclusion: [[Correct]]
Reasoning: xxx.
Conclusion: [[Wrong]]
Reasoning: xxx.
[Question Start]
{question}
[Question End]
[Reference Answers Start]
{ref}
[Reference Answers End]
[Model Response Start]
"""
chn_base_prefix = """
你是一个评判者，请你基于参考答案，公正地评判一个AI模型对于问题的回答是否正确。你需要给出"对"或"错"的结论，然后再给出相应的理由。
请注意，由于参考答案是一个候选列表，因此AI模型的回答只要符合列表中的某一项即可判断为"对"。
你的评判必须严格遵守以下格式：
结论：[[对]]
理由：xxx。
结论：[[错]]
理由：xxx。
[问题开始]
{question}
[问题结束]
[参考答案开始]
{ref}
[参考答案结束]
[模型回答开始]
"""
def prompt_construct(sample):
lan = sample['others']['lan']
question = sample['question']
if lan == 'zh':
prefix = chn_base_prefix.format(question=sample['question'],
ref=str(sample['others']['answers']))
suffix = '\n[模型回答结束]\n'
elif lan == 'en':
prefix = eng_base_prefix.format(question=sample['question'],
ref=str(sample['others']['answers']))
suffix = '\n[Model Response End]\n'
return prefix, suffix
@LOAD_DATASET.register_module()
class IRDataset(SubjectiveCmpDataset):
def load(
self,
path: str,
name: str,
):
dataset = list(super().load(path, name))
subject_dataset = []
for data in dataset:
data['gpt4_prefix'], data['gpt4_suffix'] = prompt_construct(data)
data['judge']['others'] = data['others']
data['ref'] = str(data['others']['answers'])
subject_dataset.append(data)
dataset = Dataset.from_list(subject_dataset)
return dataset
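
A small usage sketch for the loader above (editor-added, not part of the commit). The import path assumes this file lands at opencompass/datasets/subject_ir.py, as the __init__.py hunk above suggests; the sample row is invented.

from opencompass.datasets.subject_ir import prompt_construct

# Hand-made row in the shape prompt_construct expects: 'others.lan' selects the
# English or Chinese judge template, 'others.answers' is the candidate list.
sample = {
    'question': 'Who wrote "Pride and Prejudice"?',
    'others': {'lan': 'en', 'answers': ['Jane Austen']},
}
prefix, suffix = prompt_construct(sample)
# prefix ends with "[Model Response Start]"; suffix is "\n[Model Response End]\n",
# so the judge prompt wraps the evaluated model's raw answer:
judge_prompt = prefix + 'It was written by Jane Austen.' + suffix
print(judge_prompt)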

View File

@@ -5,5 +5,6 @@ from .corev2 import Corev2Summarizer # noqa: F401
from .creationbench import CreationBenchSummarizer
from .creationv01 import Creationv01Summarizer # noqa: F401
from .default import DefaultSummarizer # noqa: F401
from .information_retrival import IRSummarizer # noqa: F401
from .multiround import MultiroundSummarizer # noqa: F401
from .subjective import SubjectiveSummarizer # noqa: F401

View File

@@ -128,7 +128,7 @@ def get_dimension_results(judged_answers, references, fout, fout_flag, model):
writer = csv.writer(csvfile)
if fout_flag == 0:
writer.writerow(['模型'] + columns)
fout_flag += 1
for row in rows:
writer.writerow([row] +
[scores[row][column] for column in columns])
@@ -184,7 +184,6 @@ def get_capability_results(judged_answers,
sub_header.extend([category + '总分'])
sub_header.extend(sub_categories)
writer.writerow(sub_header)
fout_flag += 1
row = [model]
row.append(scores[model]['总分'])
@@ -203,7 +202,7 @@ class AlignmentBenchSummarizer:
It's expected to be filled out at runtime.
"""
def __init__(self, config: ConfigDict, judge_type: str) -> None:
def __init__(self, config: ConfigDict, judge_type='general') -> None:
self.tasks = []
self.cfg = config
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
@@ -252,8 +251,10 @@ class AlignmentBenchSummarizer:
if self.judge_type == 'general':
get_dimension_results(judged_answers, references, fout,
fout_flag, model)
fout_flag += 1
get_capability_results(judged_answers, references, fout2,
fout_flag2, model, self.category)
fout_flag2 += 1
else:
print(subdir_path + ' does not exist, please check!')
if self.judge_type == 'general':
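
The hunks above move the fout_flag / fout_flag2 increments out of the result-writing helpers and into the summarize loop, so the CSV header is still written exactly once per file while the helpers no longer mutate the flag. An editor-added sketch of that pattern with illustrative names (not the repo's code):

import csv

def append_scores(fout, fout_flag, model, scores):
    # The helper only writes rows; it no longer touches the header flag.
    with open(fout, 'a+', newline='') as f:
        writer = csv.writer(f)
        if fout_flag == 0:  # header only for the first model
            writer.writerow(['model'] + list(scores))
        writer.writerow([model] + [scores[k] for k in scores])

fout_flag = 0
for model, scores in [('model-a', {'total': 0.5}), ('model-b', {'total': 0.7})]:
    append_scores('judged.csv', fout_flag, model, scores)
    fout_flag += 1  # the caller owns the flag, as in this commit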

View File

@@ -0,0 +1,138 @@
# flake8: noqa: E501
import csv
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime
import numpy as np
from mmengine import ConfigDict
try:
from prettytable import from_csv
except ImportError:
from_csv = None
from opencompass.utils import model_abbr_from_cfg
from .subjective_post_process import post_process_autoj
from .utils import get_judgeanswer_and_reference, get_outdir
def post_process_ir(judgement: str):
"""Input a string like below:
Conclusion: [[Correct]]\nReasoning: xxx
and extract the score
"""
matches = re.findall(r'\[\[(.*?)\]\]', judgement)
if matches:
matches = matches[0]
if matches in ['Correct', 'Wrong', '对', '错']:
if matches == 'Correct' or matches == '对':
return {'score': 1}
else:
return {'score': 0}
else:
return None
else:
return None
def get_results(
judged_answers,
references,
fout,
fout_flag,
model,
):
capability_ratings = defaultdict(int)
capability_counts = defaultdict(int)
for ans, ref in zip(judged_answers, references):
lan = ref['others']['lan']
capability_ratings['total'] += ans['score']
capability_counts['total'] += 1
capability_ratings[lan] += ans['score']
capability_counts[lan] += 1
capability_avg_ratings = defaultdict(float)
for capability, total_score in capability_ratings.items():
capability_avg_ratings[
capability] = total_score / capability_counts[capability]
scores = {model: capability_avg_ratings}
with open(fout, 'a+', newline='') as csvfile:
writer = csv.writer(csvfile)
if fout_flag == 0:
num_header = [str(i) for i in range(4)]
writer.writerow(num_header)
header = ['模型']
for category in capability_avg_ratings:
header.append(category)
writer.writerow(header)
row = [model]
for category in capability_avg_ratings:
row.append(scores[model][category])
writer.writerow(row)
class IRSummarizer:
"""Do the subjectivity analyze based on evaluation results.
Args:
config (ConfigDict): The configuration object of the evaluation task.
It's expected to be filled out at runtime.
"""
def __init__(self, config: ConfigDict, judge_type='autoj') -> None:
self.tasks = []
self.cfg = config
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
self.eval_model_abbrs = [
model_abbr_from_cfg(model) for model in self.eval_model_cfgs
]
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
self.judge_type = judge_type
assert self.judge_type in ['general', 'autoj']
self.judge_map = {
'general': post_process_ir,
'autoj': post_process_autoj,
}
self.judge_function = self.judge_map[self.judge_type]
def summarize(self,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
"""Summarize the subjectivity analysis based on evaluation results.
Args:
time_str (str): Timestamp for file naming.
Returns:
pd.DataFrame: The summary results.
"""
dataset_cfgs = self.cfg['datasets']
output_dir, results_folder = get_outdir(self.cfg, time_str)
fout_flag = 0
for eval_model_abbr in self.eval_model_abbrs:
subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
subdir_path = os.path.join(results_folder, subdir)
if os.path.isdir(subdir_path):
model, judge_model = eval_model_abbr, self.judge_abbr
fout = osp.join(output_dir,
'judged-by--' + judge_model + '.csv')
for dataset in dataset_cfgs:
judged_answers, references = get_judgeanswer_and_reference(
dataset, subdir_path, self.judge_function)
get_results(judged_answers, references, fout, fout_flag,
model)
fout_flag += 1
else:
print(subdir_path + ' does not exist, please check!')
with open(fout, 'r') as f:
x = from_csv(f)
print(x)
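
A quick, editor-added sanity check of the scoring path (not part of the commit): post_process_ir turns a judgement into a 0/1 score or None, and get_results averages the scores per language plus a 'total'. The import path assumes this file lands at opencompass/summarizers/information_retrival.py, per the __init__.py hunk above.

from collections import defaultdict

from opencompass.summarizers.information_retrival import post_process_ir

judgements = [
    ('Conclusion: [[Correct]]\nReasoning: matches the first reference.', 'en'),
    ('结论：[[错]]\n理由：与参考答案不符。', 'zh'),
    ('I am not sure.', 'en'),  # no [[...]] tag -> post_process_ir returns None
]

totals, counts = defaultdict(int), defaultdict(int)
for text, lan in judgements:
    result = post_process_ir(text)
    if result is None:  # unparsable judgements yield None and are skipped
        continue
    for key in ('total', lan):
        totals[key] += result['score']
        counts[key] += 1

print({k: totals[k] / counts[k] for k in counts})  # {'total': 0.5, 'en': 1.0, 'zh': 0.0}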