Mirror of https://github.com/open-compass/opencompass.git
[Feature] add subject ir dataset (#755)

* add subject ir
* add ir dataset

parent be369c3e06
commit 2163f9398f
configs/datasets/subjective_ir/ir_judgedby_autoj.py  (new file, 71 lines)
@@ -0,0 +1,71 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import IRDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'ref'],
    output_column='judge',
)

subjective_all_sets = [
    "information_retrieval",
]
data_path ="data/subjective/"

subjective_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt = """为上传的针对给定用户问题的回应撰写评论, 并为该回复打分:

[BEGIN DATA]
***
[用户问询]: {question}
***
[回应]: {prediction}
***
[参考答案]: {ref}
***
[END DATA]

请根据参考答案为这个回应撰写评论. 在这之后, 你应该按照如下格式给这个回应一个最终的1-10范围的评分: "[[评分]]", 例如: "评分: [[5]]"."""
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=IRDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
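Note (illustrative, not part of the diff): a dataset config like the one above is normally pulled into a top-level evaluation config via read_base and paired with the IRSummarizer added later in this commit. A minimal sketch, assuming `models`, `judge_model` and the partitioner/runner blocks are defined elsewhere as in other subjective-eval configs, and with a hypothetical output directory:

from mmengine.config import read_base

with read_base():
    # path mirrors the new config file above; adjust to your own layout
    from .datasets.subjective_ir.ir_judgedby_autoj import subjective_datasets

from opencompass.summarizers import IRSummarizer

datasets = [*subjective_datasets]

# `models` and `judge_model` are assumed to be defined elsewhere in the config;
# IRSummarizer reads both from the runtime config.
summarizer = dict(type=IRSummarizer, judge_type='autoj')
work_dir = 'outputs/information_retrieval/'  # hypothetical output directory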
configs/datasets/subjective_ir/ir_judgedby_gpt4.py  (new file, 59 lines)
@@ -0,0 +1,59 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import IRDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'gpt4_prefix', 'gpt4_suffix', 'ref'],
    output_column='judge',
)

subjective_all_sets = [
    "information_retrieval",
]
data_path ="data/subjective/"

subjective_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt = "{gpt4_prefix}{prediction}{gpt4_suffix}"
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=IRDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
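Note (illustrative, not part of the diff): unlike the Auto-J config above, the judge prompt here is assembled per sample from columns that IRDataset.load() fills in (see opencompass/datasets/subject_ir.py later in this commit). Roughly, with the placeholder strings below invented for illustration:

# Sketch of what the evaluator template expands to for one English sample:
judge_prompt = '{gpt4_prefix}{prediction}{gpt4_suffix}'.format(
    gpt4_prefix='<judge instructions + [Question Start]...[Reference Answers End] + [Model Response Start]>',
    prediction='<the evaluated model answer>',
    gpt4_suffix='\n[Model Response End]\n',
)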
@@ -79,7 +79,7 @@ eval = dict(
 )
 
 summarizer = dict(
-    type=AlignmentBenchSummarizer,
+    type=AlignmentBenchSummarizer, judge_type = 'general'
 )
 
 work_dir = 'outputs/alignment_bench/'
opencompass/datasets/__init__.py

@@ -84,6 +84,7 @@ from .subject_alignmentbench import AlignmentBenchDataset  # noqa: F401, F403
 from .subject_corev2 import Corev2Dataset  # noqa: F401, F403
 from .subject_creationbench import CreationBenchDataset  # noqa: F401, F403
 from .subject_creationv01 import Creationv01Dataset  # noqa: F401, F403
+from .subject_ir import IRDataset  # noqa: F401, F403
 from .subject_multiround import MultiroundDataset  # noqa: F401, F403
 from .subjective_cmp import SubjectiveCmpDataset  # noqa: F401, F403
 from .summedits import *  # noqa: F401, F403
opencompass/datasets/subject_ir.py  (new file, 88 lines)
@@ -0,0 +1,88 @@
# flake8: noqa: E501
import json
import os.path as osp
import re
from typing import Optional

from datasets import Dataset, DatasetDict

from opencompass.registry import LOAD_DATASET

from .subjective_cmp import SubjectiveCmpDataset

eng_base_prefix = """
You are a judger. Please impartially judge whether an AI model's response to a question is correct based on the reference answers. You need to provide a conclusion of "correct" or "wrong," followed by the corresponding reasoning.

Note that since the reference answer is a candidate list, the AI model's response only needs to align with one item in the list to be deemed "correct."

Your judgment must strictly adhere to the following format:
Conclusion: [[Correct]]
Reasoning: xxx.

Conclusion: [[Wrong]]
Reasoning: xxx.

[Question Start]
{question}
[Question End]

[Reference Answers Start]
{ref}
[Reference Answers End]

[Model Response Start]
"""

chn_base_prefix = """
你是一个评判者，请你基于参考答案，公正地评判一个AI模型对于问题的回答是否正确。你需要给出“对或错”的结论，然后再给出相应的理由。
请注意，由于参考答案是一个候选列表，因此AI模型的回答只要符合列表中的某一项即可判断为“对”。
你的评判必须严格遵守以下格式：
结论：[[对]]
理由：xxx。

结论：[[错]]
理由：xxx。

[问题开始]
{question}
[问题结束]

[参考答案开始]
{ref}
[参考答案结束]

[模型回答开始]
"""


def prompt_construct(sample):
    lan = sample['others']['lan']
    question = sample['question']
    if lan == 'zh':
        prefix = chn_base_prefix.format(question=sample['question'],
                                        ref=str(sample['others']['answers']))
        suffix = '\n[模型回答结束]\n'
    elif lan == 'en':
        prefix = eng_base_prefix.format(question=sample['question'],
                                        ref=str(sample['others']['answers']))
        suffix = '\n[Model Response End]\n'
    return prefix, suffix


@LOAD_DATASET.register_module()
class IRDataset(SubjectiveCmpDataset):

    def load(
        self,
        path: str,
        name: str,
    ):
        dataset = list(super().load(path, name))
        subject_dataset = []
        for data in dataset:
            data['gpt4_prefix'], data['gpt4_suffix'] = prompt_construct(data)
            data['judge']['others'] = data['others']
            data['ref'] = str(data['others']['answers'])
            subject_dataset.append(data)
        dataset = Dataset.from_list(subject_dataset)
        return dataset
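Note (illustrative, not part of the diff): the field names below are inferred from the code above; the actual files under data/subjective/ may differ. A minimal assumed sample and what load() derives from it:

sample = {
    'question': 'Who wrote "Pride and Prejudice"?',
    'capability': 'information_retrieval',
    'others': {'lan': 'en', 'answers': ['Jane Austen']},
    'judge': {},
}

prefix, suffix = prompt_construct(sample)
# prefix == eng_base_prefix with {question} and {ref} filled in
# suffix == '\n[Model Response End]\n'
# load() then sets, per sample:
#   data['gpt4_prefix'], data['gpt4_suffix'] = prefix, suffix
#   data['judge']['others'] = data['others']
#   data['ref'] = str(data['others']['answers'])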
opencompass/summarizers/__init__.py

@@ -5,5 +5,6 @@ from .corev2 import Corev2Summarizer  # noqa: F401
 from .creationbench import CreationBenchSummarizer
 from .creationv01 import Creationv01Summarizer  # noqa: F401
 from .default import DefaultSummarizer  # noqa: F401
+from .information_retrival import IRSummarizer  # noqa: F401
 from .multiround import MultiroundSummarizer  # noqa: F401
 from .subjective import SubjectiveSummarizer  # noqa: F401
opencompass/summarizers/alignmentbench.py

@@ -128,7 +128,7 @@ def get_dimension_results(judged_answers, references, fout, fout_flag, model):
         writer = csv.writer(csvfile)
         if fout_flag == 0:
             writer.writerow(['模型'] + columns)
-            fout_flag += 1
+
         for row in rows:
             writer.writerow([row] +
                             [scores[row][column] for column in columns])

@@ -184,7 +184,6 @@ def get_capability_results(judged_answers,
                 sub_header.extend([category + '总分'])
                 sub_header.extend(sub_categories)
             writer.writerow(sub_header)
-            fout_flag += 1
 
         row = [model]
         row.append(scores[model]['总分'])

@@ -203,7 +202,7 @@ class AlignmentBenchSummarizer:
             It's expected to be filled out at runtime.
     """
 
-    def __init__(self, config: ConfigDict, judge_type: str) -> None:
+    def __init__(self, config: ConfigDict, judge_type='general') -> None:
         self.tasks = []
         self.cfg = config
         self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']

@@ -252,8 +251,10 @@ class AlignmentBenchSummarizer:
                     if self.judge_type == 'general':
                         get_dimension_results(judged_answers, references, fout,
                                               fout_flag, model)
+                        fout_flag += 1
                     get_capability_results(judged_answers, references, fout2,
                                            fout_flag2, model, self.category)
+                    fout_flag2 += 1
             else:
                 print(subdir_path + ' is not exist! please check!')
         if self.judge_type == 'general':
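Note (a hedged reading, not stated in the diff): moving the fout_flag / fout_flag2 increments out of the helpers and up to the call site in summarize() matters because the flags are plain ints. Incrementing them inside get_dimension_results or get_capability_results never reached the caller, so the CSV header guard never flipped. A minimal sketch of the pitfall:

def helper(flag):
    flag += 1        # rebinds a local int; the caller's variable is untouched

flag = 0
helper(flag)
print(flag)          # prints 0, so a header guarded by `if flag == 0` would be
                     # rewritten on every call unless the caller increments it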
opencompass/summarizers/information_retrival.py  (new file, 138 lines)
@@ -0,0 +1,138 @@
# flake8: noqa: E501
import csv
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime

import numpy as np
from mmengine import ConfigDict

try:
    from prettytable import from_csv
except ImportError:
    from_csv = None

from opencompass.utils import model_abbr_from_cfg

from .subjective_post_process import post_process_autoj
from .utils import get_judgeanswer_and_reference, get_outdir


def post_process_ir(judgement: str):
    """Input a string like below:

    Conclusion: [[Correct]]\nReasoning: xxx
    and extract the score
    """
    matches = re.findall(r'\[\[(.*?)\]\]', judgement)
    if matches:
        matches = matches[0]
        if matches in ['Correct', 'Wrong', '对', '错']:
            if matches == 'Correct' or matches == '对':
                return {'score': 1}
            else:
                return {'score': 0}
        else:
            return None
    else:
        return None


def get_results(
    judged_answers,
    references,
    fout,
    fout_flag,
    model,
):
    capability_ratings = defaultdict(int)
    capability_counts = defaultdict(int)
    for ans, ref in zip(judged_answers, references):
        lan = ref['others']['lan']
        capability_ratings['total'] += ans['score']
        capability_counts['total'] += 1
        capability_ratings[lan] += ans['score']
        capability_counts[lan] += 1

    capability_avg_ratings = defaultdict(float)

    for capability, total_score in capability_ratings.items():
        capability_avg_ratings[
            capability] = total_score / capability_counts[capability]

    scores = {model: capability_avg_ratings}

    with open(fout, 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if fout_flag == 0:
            num_header = [str(i) for i in range(4)]
            writer.writerow(num_header)

            header = ['模型']
            for category in capability_avg_ratings:
                header.append(category)
            writer.writerow(header)

        row = [model]
        for category in capability_avg_ratings:
            row.append(scores[model][category])
        writer.writerow(row)


class IRSummarizer:
    """Do the subjectivity analyze based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """

    def __init__(self, config: ConfigDict, judge_type='autoj') -> None:
        self.tasks = []
        self.cfg = config
        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
        self.eval_model_abbrs = [
            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
        ]
        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
        self.judge_type = judge_type
        assert self.judge_type in ['general', 'autoj']
        self.judge_map = {
            'general': post_process_ir,
            'autoj': post_process_autoj,
        }
        self.judge_function = self.judge_map[self.judge_type]

    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            pd.DataFrame: The summary results.
        """
        dataset_cfgs = self.cfg['datasets']
        output_dir, results_folder = get_outdir(self.cfg, time_str)
        fout_flag = 0
        for eval_model_abbr in self.eval_model_abbrs:
            subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model, judge_model = eval_model_abbr, self.judge_abbr
                fout = osp.join(output_dir,
                                'judged-by--' + judge_model + '.csv')
                for dataset in dataset_cfgs:
                    judged_answers, references = get_judgeanswer_and_reference(
                        dataset, subdir_path, self.judge_function)
                    get_results(judged_answers, references, fout, fout_flag,
                                model)
                    fout_flag += 1
            else:
                print(subdir_path + ' is not exist! please check!')
        with open(fout, 'r') as f:
            x = from_csv(f)
        print(x)
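Note (illustrative, not part of the diff): with judge_type='general', the summarizer parses judge verdicts with post_process_ir. A quick check of its behavior, with expected outputs shown as comments:

print(post_process_ir('Conclusion: [[Correct]]\nReasoning: matches the reference answer.'))
# -> {'score': 1}
print(post_process_ir('结论：[[错]]\n理由：回答与参考答案不符。'))
# -> {'score': 0}
print(post_process_ir('No [[verdict]] in the expected vocabulary.'))
# -> None

get_results then averages these 0/1 scores into a 'total' bucket plus one bucket per language tag found in ref['others']['lan'], and appends one CSV row per evaluated model.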