add writingbench

bittersweet1999 2025-04-18 09:21:01 +00:00
parent 5fee3b237a
commit b93afe7764
4 changed files with 220 additions and 2 deletions

View File

@@ -0,0 +1,69 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WritingBenchDataset, writingbench_postprocess
from mmengine.config import read_base

subjective_reader_cfg = dict(
    input_columns=['question'],
    output_column='judge',
)

subjective_all_sets = [
    'writingbench',
]

writingbench_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{question}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            multi_eval=True,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are an expert evaluator with extensive experience in evaluating responses to a given query.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt='{prediction}'
                        ),
                    ]),
            ),
            dict_postprocessor=dict(type=writingbench_postprocess),
        ),
        pred_role='BOT',
    )

    writingbench_datasets.append(
        dict(
            abbr=f'{_name}',
            type=WritingBenchDataset,
            path='./data/subjective/writingbench',
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='singlescore',
        ))
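
For context, the WritingBenchDataset loader added later in this commit reads ./data/subjective/writingbench/writingbench.jsonl (built from the path and name above), with one JSON object per line carrying index, domain1, domain2, query, and criteria fields. Below is a minimal sketch of such a record; the values are purely illustrative, and since each criteria entry is formatted straight into the judge prompt, plain strings are enough:

```python
import json

# Hypothetical WritingBench record; field names follow WritingBenchDataset.load(),
# values are made up for illustration.
record = {
    'index': 0,
    'domain1': 'Academic & Engineering',
    'domain2': 'Paper Outline',
    'query': 'Draft an outline for a survey on retrieval-augmented generation.',
    'criteria': ['Coverage of the requested topics', 'Clarity and organization'],
}

# One record per line in ./data/subjective/writingbench/writingbench.jsonl
with open('writingbench.jsonl', 'w', encoding='utf-8') as f:
    f.write(json.dumps(record, ensure_ascii=False) + '\n')
```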

View File

@@ -35,3 +35,4 @@ from .subjective_cmp import SubjectiveCmpDataset  # noqa: F401, F403
from .wildbench import WildBenchDataset  # noqa: F401, F403
from .wildbench import wildbench_bradleyterry_postprocess  # noqa: F401, F403
from .wildbench import wildbench_postprocess  # noqa: F401, F403
from .writingbench import *  # noqa: F401, F403

View File

@@ -0,0 +1,114 @@
# flake8: noqa
import json
import os.path as osp
import re
from collections import defaultdict

from datasets import Dataset

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference

base_prompt = """Evaluate the Response based on the Query and criteria provided.

** Criteria **
```{criteria}```

** Query **
```{question}```

** Response **
```{prediction}```

Provide your evaluation based on the criteria:
```{criteria}```

Provide reasons for each score, indicating where and why any strengths or deficiencies occur within the Response. Reference specific passages or elements from the text to support your justification.
Ensure that each reason is concrete, with explicit references to the text that aligns with the criteria requirements.

Scoring Range: Assign an integer score between 1 and 10.

** Output format **
Return the results in the following JSON format. Only output this JSON format and nothing else:
```json
{{
    "score": an integer score between 1 and 10,
    "reason": "Specific and detailed justification for the score using text elements."
}}
```
"""


@LOAD_DATASET.register_module()
class WritingBenchDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}.jsonl')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                domain1 = data['domain1']
                domain2 = data['domain2']
                query = data['query']
                criteria = data['criteria']
                # Build one judge prompt per criterion; the '{prediction}'
                # placeholder is filled in later by LMEvaluator.
                judge_prompt_list = []
                for criteria_item in criteria:
                    temp_prompt = base_prompt.format(question=query,
                                                     criteria=criteria_item,
                                                     prediction='{prediction}')
                    judge_prompt_list.append(temp_prompt)
                idx = data['index']
                raw_data.append({
                    'question': query,
                    'judge': {
                        'index': idx,
                        'domain1': domain1,
                        'domain2': domain2,
                        'query': query,
                        'judge_prompt_list': judge_prompt_list,
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset


def post_process_writingbench(judgement: dict):
    """Extract the integer score from a judge reply such as:

    {"score": 9, "reason": "The response provides..."}
    """
    match = re.search(r"[\"']score[\"']:\s*([0-9]+)", judgement['prediction'])
    if match:
        score = int(match.group(1))
    else:
        return None

    return {'score': score}


@DICT_POSTPROCESSORS.register_module('writingbench')
def writingbench_postprocess(output: dict, output_path: str) -> dict:
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_writingbench)

    # Average per domain and overall; entries without a parsed score are skipped.
    scores = defaultdict(list)
    for ans, ref in zip(judged_answers, references):
        domain = ref['domain1']
        score = ans['score']
        if score is not None:
            scores['overall'].append(score)
            scores[domain].append(score)

    single_model_scores = {
        task: sum(score) / len(score)
        for task, score in scores.items()
    }
    results = single_model_scores
    results['details'] = output
    return results
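
As a quick check of the parsing logic above, post_process_writingbench only needs to find a "score": N pattern in the judge reply requested by base_prompt. A minimal, self-contained sketch (the reply text is invented for illustration):

```python
import re

# Hypothetical judge reply, shaped like the JSON that base_prompt asks for.
sample_judgement = {
    'prediction': '{"score": 8, "reason": "Clear structure and a consistent tone."}'
}

# Same pattern that post_process_writingbench applies.
match = re.search(r"[\"']score[\"']:\s*([0-9]+)", sample_judgement['prediction'])
print({'score': int(match.group(1))} if match else None)  # -> {'score': 8}
```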

View File

@@ -129,7 +129,9 @@ class LMEvaluator:
                 pred_postprocessor: Optional[ConfigDict] = None,
                 dict_postprocessor: Optional[ConfigDict] = None,
                 keep_predictions: bool = False,
                 multi_eval=False,
                 ) -> None:
        self.multi_eval = multi_eval
        self.output_path = output_path
        out_dir, out_name = osp.split(output_path)
        if not out_dir:
@@ -209,6 +211,33 @@ class LMEvaluator:
                references = [
                    {} for _ in range(len(predictions[0]['model_preds']))
                ]
            if self.multi_eval:
                # Expand each prediction into one judge request per criterion:
                # every entry of 'judge_prompt_list' becomes its own input,
                # with the model output substituted for '{prediction}'.
                assert references is not None
                assert 'judge_prompt_list' in references[0]
                self.multi_eval_times = len(references[0]['judge_prompt_list'])
                temp_predictions_save_list = []
                for idx, pred in enumerate(predictions['model_preds']):
                    for judge_prompt in references[idx]['judge_prompt_list']:
                        temp_prediction = judge_prompt.replace(
                            '{prediction}', pred)
                        temp_predictions_save_list.append(temp_prediction)
                predictions['model_preds'] = temp_predictions_save_list

                # Duplicate each reference accordingly, replacing the prompt
                # list with the single prompt used for that judge request.
                temp_references_save_list = []
                for item in references:
                    new_item = {
                        key: value
                        for key, value in item.items()
                        if key != 'judge_prompt_list'
                    }
                    if 'judge_prompt_list' in item:
                        for prompt in item['judge_prompt_list']:
                            temp_item = new_item.copy()
                            temp_item['judge_prompt'] = prompt
                            temp_references_save_list.append(temp_item)
                    else:
                        temp_references_save_list.append(item)
                references = temp_references_save_list
            predictions = [predictions['model_preds']]
        # Due to the rarity of identical predictions, we have temporarily disabled the plagiarism detection feature.
@@ -268,7 +297,12 @@ class LMEvaluator:
        if self.dataset_cfg:
            dataset = build_dataset_from_cfg(self.dataset_cfg)

            if self.multi_eval:
                # Enlarge the test split so its length matches the expanded
                # list of judge inputs (multi_eval_times entries per sample).
                new_ds = {
                    k: dataset.test[k] * self.multi_eval_times
                    for k in dataset.test.column_names
                }
                dataset.reader.dataset['test'] = Dataset.from_dict(new_ds)

            if infer_order == 'double':
                new_ds = {
                    k: dataset.test[k] * 2
@@ -329,4 +363,4 @@ class LMEvaluator:
        else:
            kwargs = self.dict_postprocessor
            proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
            return proc(output, self.output_path, **kwargs)
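
To make the multi_eval path easier to follow, here is a standalone sketch of the expansion it performs: each model prediction is paired with every judge prompt in its reference, so one WritingBench sample triggers len(judge_prompt_list) judge calls, and the references are duplicated to stay aligned. All names and data below are illustrative and not part of the commit:

```python
# Standalone illustration of the multi_eval expansion (made-up data).
predictions = {'model_preds': ['draft A', 'draft B']}
references = [
    {'index': 0, 'domain1': 'Academic & Engineering',
     'judge_prompt_list': ['Criterion 1 ... {prediction}', 'Criterion 2 ... {prediction}']},
    {'index': 1, 'domain1': 'Advertising & Marketing',
     'judge_prompt_list': ['Criterion 1 ... {prediction}', 'Criterion 2 ... {prediction}']},
]

expanded_preds, expanded_refs = [], []
for idx, pred in enumerate(predictions['model_preds']):
    for prompt in references[idx]['judge_prompt_list']:
        # The judge receives the full prompt with the model output filled in.
        expanded_preds.append(prompt.replace('{prediction}', pred))
        # Each expanded reference keeps its metadata but carries a single prompt.
        ref = {k: v for k, v in references[idx].items() if k != 'judge_prompt_list'}
        ref['judge_prompt'] = prompt
        expanded_refs.append(ref)

print(len(expanded_preds), len(expanded_refs))  # 4 4 -> 2 predictions x 2 criteria each
```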