Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Feature] add subject ir dataset (#755)

* add subject ir
* Add ir dataset
* Add ir dataset

parent be369c3e06
commit 2163f9398f
configs/datasets/subjective_ir/ir_judgedby_autoj.py  (new file, 71 lines added)
@@ -0,0 +1,71 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import IRDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'ref'],
    output_column='judge',
)

subjective_all_sets = [
    "information_retrieval",
]
data_path = "data/subjective/"

subjective_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        # Chinese judge prompt; gist: "Write a critique of the
                        # response to the user question below, then give it a
                        # final 1-10 rating in the form '评分: [[5]]'."
                        prompt="""为上传的针对给定用户问题的回应撰写评论, 并为该回复打分:

[BEGIN DATA]
***
[用户问询]: {question}
***
[回应]: {prediction}
***
[参考答案]: {ref}
***
[END DATA]

请根据参考答案为这个回应撰写评论. 在这之后, 你应该按照如下格式给这个回应一个最终的1-10范围的评分: "[[评分]]", 例如: "评分: [[5]]"."""
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=IRDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
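Not part of the diff: a minimal sketch of how a dataset config like the one above is normally pulled into an OpenCompass entry config through read_base. Only the import path comes from the file added above; the entry-config name and the surrounding fields are assumptions.

# Hypothetical entry config (e.g. configs/eval_subjective_ir.py), illustrative only.
from mmengine.config import read_base

with read_base():
    # import path mirrors configs/datasets/subjective_ir/ir_judgedby_autoj.py added above
    from .datasets.subjective_ir.ir_judgedby_autoj import subjective_datasets

datasets = [*subjective_datasets]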
configs/datasets/subjective_ir/ir_judgedby_gpt4.py  (new file, 59 lines added)
@@ -0,0 +1,59 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import IRDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'gpt4_prefix', 'gpt4_suffix', 'ref'],
    output_column='judge',
)

subjective_all_sets = [
    "information_retrieval",
]
data_path = "data/subjective/"

subjective_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt="{gpt4_prefix}{prediction}{gpt4_suffix}"
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=IRDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
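The gpt4-judged variant delegates the judge prompt to the dataset: the reader exposes gpt4_prefix and gpt4_suffix columns, which IRDataset fills via prompt_construct (defined in opencompass/datasets/subject_ir.py below). A sketch, with made-up sample values, of what the "{gpt4_prefix}{prediction}{gpt4_suffix}" template expands to:

# Illustrative only; prompt_construct comes from opencompass/datasets/subject_ir.py.
sample = {
    'question': 'Who wrote the novel "Dune"?',
    'others': {'lan': 'en', 'answers': ['Frank Herbert']},
}
prefix, suffix = prompt_construct(sample)
prediction = 'The novel "Dune" was written by Frank Herbert.'
judge_prompt = prefix + prediction + suffix   # the string the LM judge receives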
@@ -79,7 +79,7 @@ eval = dict(
 )
 
 summarizer = dict(
-    type=AlignmentBenchSummarizer,
+    type=AlignmentBenchSummarizer, judge_type = 'general'
 )
 
 work_dir = 'outputs/alignment_bench/'
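By analogy, an entry config for the new IR dataset could point its summarizer at the IRSummarizer added in this commit. The line below is a hedged sketch, not part of the diff:

# Hypothetical summarizer entry for the IR dataset; IRSummarizer (see
# opencompass/summarizers/information_retrival.py below) defaults to
# judge_type='autoj' and also accepts 'general'.
summarizer = dict(type=IRSummarizer, judge_type='autoj')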
@@ -84,6 +84,7 @@ from .subject_alignmentbench import AlignmentBenchDataset  # noqa: F401, F403
 from .subject_corev2 import Corev2Dataset  # noqa: F401, F403
 from .subject_creationbench import CreationBenchDataset  # noqa: F401, F403
 from .subject_creationv01 import Creationv01Dataset  # noqa: F401, F403
+from .subject_ir import IRDataset  # noqa: F401, F403
 from .subject_multiround import MultiroundDataset  # noqa: F401, F403
 from .subjective_cmp import SubjectiveCmpDataset  # noqa: F401, F403
 from .summedits import *  # noqa: F401, F403
opencompass/datasets/subject_ir.py  (new file, 88 lines added)
@@ -0,0 +1,88 @@
# flake8: noqa: E501
import json
import os.path as osp
import re
from typing import Optional

from datasets import Dataset, DatasetDict

from opencompass.registry import LOAD_DATASET

from .subjective_cmp import SubjectiveCmpDataset

eng_base_prefix = """
You are a judger. Please impartially judge whether an AI model's response to a question is correct based on the reference answers. You need to provide a conclusion of "correct" or "wrong," followed by the corresponding reasoning.

Note that since the reference answer is a candidate list, the AI model's response only needs to align with one item in the list to be deemed "correct."

Your judgment must strictly adhere to the following format:
Conclusion: [[Correct]]
Reasoning: xxx.

Conclusion: [[Wrong]]
Reasoning: xxx.

[Question Start]
{question}
[Question End]

[Reference Answers Start]
{ref}
[Reference Answers End]

[Model Response Start]
"""

# Chinese counterpart of eng_base_prefix: same judging instructions, with the
# verdict rendered as [[对]] (correct) / [[错]] (wrong).
chn_base_prefix = """
你是一个评判者，请你基于参考答案，公正地评判一个AI模型对于问题的回答是否正确。你需要给出“对或错”的结论，然后再给出相应的理由。
请注意，由于参考答案是一个候选列表，因此AI模型的回答只要符合列表中的某一项即可判断为“对”。
你的评判必须严格遵守以下格式：
结论：[[对]]
理由：xxx。

结论：[[错]]
理由：xxx。

[问题开始]
{question}
[问题结束]

[参考答案开始]
{ref}
[参考答案结束]

[模型回答开始]
"""


def prompt_construct(sample):
    # assumes every sample is tagged with lan == 'zh' or lan == 'en'
    lan = sample['others']['lan']
    question = sample['question']
    if lan == 'zh':
        prefix = chn_base_prefix.format(question=sample['question'],
                                        ref=str(sample['others']['answers']))
        suffix = '\n[模型回答结束]\n'
    elif lan == 'en':
        prefix = eng_base_prefix.format(question=sample['question'],
                                        ref=str(sample['others']['answers']))
        suffix = '\n[Model Response End]\n'
    return prefix, suffix


@LOAD_DATASET.register_module()
class IRDataset(SubjectiveCmpDataset):

    def load(
        self,
        path: str,
        name: str,
    ):
        dataset = list(super().load(path, name))
        subject_dataset = []
        for data in dataset:
            data['gpt4_prefix'], data['gpt4_suffix'] = prompt_construct(data)
            data['judge']['others'] = data['others']
            data['ref'] = str(data['others']['answers'])
            subject_dataset.append(data)
        dataset = Dataset.from_list(subject_dataset)
        return dataset
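For orientation, a hedged sketch of the per-sample structure IRDataset.load works with, assuming the parent SubjectiveCmpDataset yields records with the question/others/judge fields used above. All values below are made up.

# Hypothetical raw record before IRDataset.load post-processing.
raw = {
    'question': 'Who wrote the novel "Dune"?',
    'capability': 'information_retrieval',
    'others': {'lan': 'en', 'answers': ['Frank Herbert']},
    'judge': {},
}
# After load(), each record additionally carries:
#   gpt4_prefix / gpt4_suffix : judge-prompt pieces from prompt_construct()
#   ref                       : str(others['answers']), filled into {ref} in the templates
#   judge['others']           : copy of 'others', so the summarizer can read the language tag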
@@ -5,5 +5,6 @@ from .corev2 import Corev2Summarizer  # noqa: F401
 from .creationbench import CreationBenchSummarizer
 from .creationv01 import Creationv01Summarizer  # noqa: F401
 from .default import DefaultSummarizer  # noqa: F401
+from .information_retrival import IRSummarizer  # noqa: F401
 from .multiround import MultiroundSummarizer  # noqa: F401
 from .subjective import SubjectiveSummarizer  # noqa: F401
@@ -128,7 +128,7 @@ def get_dimension_results(judged_answers, references, fout, fout_flag, model):
         writer = csv.writer(csvfile)
         if fout_flag == 0:
             writer.writerow(['模型'] + columns)
-            fout_flag += 1
+
         for row in rows:
             writer.writerow([row] +
                             [scores[row][column] for column in columns])
@@ -184,7 +184,6 @@ def get_capability_results(judged_answers,
                 sub_header.extend([category + '总分'])
                 sub_header.extend(sub_categories)
             writer.writerow(sub_header)
-            fout_flag += 1
 
         row = [model]
         row.append(scores[model]['总分'])
@@ -203,7 +202,7 @@ class AlignmentBenchSummarizer:
             It's expected to be filled out at runtime.
     """
 
-    def __init__(self, config: ConfigDict, judge_type: str) -> None:
+    def __init__(self, config: ConfigDict, judge_type='general') -> None:
         self.tasks = []
         self.cfg = config
         self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
@@ -252,8 +251,10 @@ class AlignmentBenchSummarizer:
                     if self.judge_type == 'general':
                         get_dimension_results(judged_answers, references, fout,
                                               fout_flag, model)
+                        fout_flag += 1
                     get_capability_results(judged_answers, references, fout2,
                                            fout_flag2, model, self.category)
+                    fout_flag2 += 1
             else:
                 print(subdir_path + ' is not exist! please check!')
         if self.judge_type == 'general':
opencompass/summarizers/information_retrival.py  (new file, 138 lines added)
@@ -0,0 +1,138 @@
# flake8: noqa: E501
import csv
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime

import numpy as np
from mmengine import ConfigDict

try:
    from prettytable import from_csv
except ImportError:
    from_csv = None

from opencompass.utils import model_abbr_from_cfg

from .subjective_post_process import post_process_autoj
from .utils import get_judgeanswer_and_reference, get_outdir


def post_process_ir(judgement: str):
    """Extract the score from a judgement string shaped like:

    Conclusion: [[Correct]]\nReasoning: xxx
    """
    matches = re.findall(r'\[\[(.*?)\]\]', judgement)
    if matches:
        matches = matches[0]
        if matches in ['Correct', 'Wrong', '对', '错']:
            if matches == 'Correct' or matches == '对':
                return {'score': 1}
            else:
                return {'score': 0}
        else:
            return None
    else:
        return None


def get_results(
    judged_answers,
    references,
    fout,
    fout_flag,
    model,
):
    # accumulate scores overall and per language ('zh' / 'en')
    capability_ratings = defaultdict(int)
    capability_counts = defaultdict(int)
    for ans, ref in zip(judged_answers, references):
        lan = ref['others']['lan']
        capability_ratings['total'] += ans['score']
        capability_counts['total'] += 1
        capability_ratings[lan] += ans['score']
        capability_counts[lan] += 1

    capability_avg_ratings = defaultdict(float)

    for capability, total_score in capability_ratings.items():
        capability_avg_ratings[
            capability] = total_score / capability_counts[capability]

    scores = {model: capability_avg_ratings}

    with open(fout, 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if fout_flag == 0:
            num_header = [str(i) for i in range(4)]
            writer.writerow(num_header)

            header = ['模型']
            for category in capability_avg_ratings:
                header.append(category)
            writer.writerow(header)

        row = [model]
        for category in capability_avg_ratings:
            row.append(scores[model][category])
        writer.writerow(row)


class IRSummarizer:
    """Do the subjectivity analysis based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """

    def __init__(self, config: ConfigDict, judge_type='autoj') -> None:
        self.tasks = []
        self.cfg = config
        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
        self.eval_model_abbrs = [
            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
        ]
        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
        self.judge_type = judge_type
        assert self.judge_type in ['general', 'autoj']
        self.judge_map = {
            'general': post_process_ir,
            'autoj': post_process_autoj,
        }
        self.judge_function = self.judge_map[self.judge_type]

    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            None. The per-model scores are appended to a CSV in the output
            directory and printed to stdout.
        """
        dataset_cfgs = self.cfg['datasets']
        output_dir, results_folder = get_outdir(self.cfg, time_str)
        fout_flag = 0
        for eval_model_abbr in self.eval_model_abbrs:
            subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model, judge_model = eval_model_abbr, self.judge_abbr
                fout = osp.join(output_dir,
                                'judged-by--' + judge_model + '.csv')
                for dataset in dataset_cfgs:
                    judged_answers, references = get_judgeanswer_and_reference(
                        dataset, subdir_path, self.judge_function)
                    get_results(judged_answers, references, fout, fout_flag,
                                model)
                    fout_flag += 1
            else:
                print(subdir_path + ' does not exist! Please check!')
        with open(fout, 'r') as f:
            x = from_csv(f)
        print(x)
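A quick, self-contained check of post_process_ir on the two verdict formats it recognises (anything else returns None and the sample is skipped by the summarizer); the judgement strings below are made up:

assert post_process_ir('Conclusion: [[Correct]]\nReasoning: matches the reference.') == {'score': 1}
assert post_process_ir('结论：[[错]]\n理由：与参考答案不符。') == {'score': 0}
assert post_process_ir('No verdict given.') is None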