mirror of https://github.com/open-compass/opencompass.git
add writingbench

commit b93afe7764 (parent 5fee3b237a)
@@ -0,0 +1,69 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WritingBenchDataset, writingbench_postprocess
from mmengine.config import read_base

subjective_reader_cfg = dict(
    input_columns=['question'],
    output_column='judge',
)

subjective_all_sets = [
    'writingbench'
]

writingbench_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{question}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer,),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            multi_eval=True,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are an expert evaluator with extensive experience in evaluating responses to a given query.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt='{prediction}'
                        ),
                    ]),
            ),
            dict_postprocessor=dict(type=writingbench_postprocess),
        ),
        pred_role='BOT',
    )

    writingbench_datasets.append(
        dict(
            abbr=f'{_name}',
            type=WritingBenchDataset,
            path='./data/subjective/writingbench',
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='singlescore',
        ))
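The config above only builds the `writingbench_datasets` list; a top-level run config still has to pull it in and pair it with a judge model, in the same way existing subjective benchmarks such as WildBench are wired up. Below is a minimal sketch of that wiring; the relative import path and the empty model lists are placeholders, not part of this commit:

# Hypothetical top-level config sketch; file path and model entries are placeholders.
from mmengine.config import read_base

with read_base():
    # Assumed relative import of the dataset config shown above.
    from .writingbench.writingbench_judge import writingbench_datasets

datasets = [*writingbench_datasets]

# Placeholder model lists; real entries come from opencompass.models configs.
models = []        # models whose writing outputs will be judged
judge_models = []  # LLM judge(s) consumed by LMEvaluator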
@@ -35,3 +35,4 @@ from .subjective_cmp import SubjectiveCmpDataset  # noqa: F401, F403
 from .wildbench import WildBenchDataset  # noqa: F401, F403
 from .wildbench import wildbench_bradleyterry_postprocess  # noqa: F401, F403
 from .wildbench import wildbench_postprocess  # noqa: F401, F403
+from .writingbench import *
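With this re-export in place, the names used by the dataset config at the top of the diff resolve through the package root. A quick sanity check, assuming an environment with this commit of opencompass installed:

# Assumes an opencompass checkout containing this commit is importable.
from opencompass.datasets import WritingBenchDataset, writingbench_postprocess

print(WritingBenchDataset.__module__)     # opencompass.datasets.subjective.writingbench
print(writingbench_postprocess.__name__)  # writingbench_postprocess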
opencompass/datasets/subjective/writingbench.py (new file, 114 lines)
@@ -0,0 +1,114 @@
# flake8: noqa
import json
import os.path as osp
import re
from collections import defaultdict

from datasets import Dataset

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference


base_prompt = """Evaluate the Response based on the Query and criteria provided.

** Criteria **
```{criteria}```

** Query **
```{question}```

** Response **
```{prediction}```

Provide your evaluation based on the criteria:

```{criteria}```

Provide reasons for each score, indicating where and why any strengths or deficiencies occur within the Response. Reference specific passages or elements from the text to support your justification.
Ensure that each reason is concrete, with explicit references to the text that align with the criteria requirements.

Scoring Range: Assign an integer score from 1 to 10.

** Output format **
Return the results in the following JSON format. Only output this JSON format and nothing else:
```json
{{
  "score": an integer score from 1 to 10,
  "reason": "Specific and detailed justification for the score using text elements."
}}
```
"""


@LOAD_DATASET.register_module()
class WritingBenchDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}.jsonl')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                domain1 = data['domain1']
                domain2 = data['domain2']
                query = data['query']
                criteria = data['criteria']
                judge_prompt_list = []
                for criteria_item in criteria:
                    temp_prompt = base_prompt.format(question=query,
                                                     criteria=criteria_item,
                                                     prediction='{prediction}')
                    judge_prompt_list.append(temp_prompt)
                idx = data['index']
                raw_data.append({
                    'question': query,
                    'judge': {
                        'index': idx,
                        'domain1': domain1,
                        'domain2': domain2,
                        'query': query,
                        'judge_prompt_list': judge_prompt_list
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset


def post_process_writingbench(judgement: dict):
    """Extract the integer score from a judge reply such as:

    {"score": 9, "reason": "The response provides..."}
    """
    match = re.search(r"[\"']score[\"']:\s*([0-9]+)", judgement['prediction'])
    if match:
        score = int(match.group(1))
    else:
        return None

    return {'score': score}


@DICT_POSTPROCESSORS.register_module('writingbench')
def writingbench_postprocess(output: dict, output_path: str) -> dict:
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_writingbench)

    if len(judged_answers) == 0:
        scores = None

    scores = defaultdict(list)
    for ans, ref in zip(judged_answers, references):
        domain = ref['domain1']
        score = ans['score']
        if score is not None:
            scores['overall'].append(score)
            scores[domain].append(score)
    single_model_scores = {
        task: sum(score) / len(score)
        for task, score in scores.items()
    }
    results = single_model_scores
    results['details'] = output
    return results
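To make the data contract concrete, here is a small illustrative sketch of the kind of JSONL record `WritingBenchDataset.load` expects, and of how `post_process_writingbench` extracts the score from a judge reply. The field names match the code above; the concrete values are invented for illustration:

# Illustrative only: field names follow load()/post_process_writingbench above,
# the values are made up.
import json
import re

sample_record = {
    'index': 1,
    'domain1': 'academic',          # primary domain, used for per-domain averaging
    'domain2': 'paper_outline',     # secondary domain, stored but not aggregated
    'query': 'Draft an outline for a survey on LLM-as-a-judge evaluation.',
    'criteria': [                   # one judge prompt is built per criterion
        'Relevance to the query',
        'Organization and coherence',
    ],
}
print(json.dumps(sample_record))    # one line of writingbench.jsonl

judge_reply = '```json\n{"score": 8, "reason": "Well structured outline."}\n```'
match = re.search(r"[\"']score[\"']:\s*([0-9]+)", judge_reply)
assert match and int(match.group(1)) == 8   # same regex as post_process_writingbench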
@@ -129,7 +129,9 @@ class LMEvaluator:
         pred_postprocessor: Optional[ConfigDict] = None,
         dict_postprocessor: Optional[ConfigDict] = None,
         keep_predictions: bool = False,
+        multi_eval=False,
     ) -> None:
+        self.multi_eval = multi_eval
         self.output_path = output_path
         out_dir, out_name = osp.split(output_path)
         if not out_dir:
@@ -209,6 +211,33 @@ class LMEvaluator:
             references = [
                 {} for _ in range(len(predictions[0]['model_preds']))
             ]
+        if self.multi_eval:
+            assert references is not None
+            assert 'judge_prompt_list' in references[0]
+            self.multi_eval_times = len(references[0]['judge_prompt_list'])
+            temp_predictions_save_list = []
+            for idx, pred in enumerate(predictions['model_preds']):
+                for judge_prompt in references[idx]['judge_prompt_list']:
+                    temp_prediction = judge_prompt.replace(
+                        '{prediction}', pred)
+                    temp_predictions_save_list.append(temp_prediction)
+            predictions['model_preds'] = temp_predictions_save_list
+
+            temp_references_save_list = []
+            for item in references:
+                new_item = {
+                    key: value
+                    for key, value in item.items()
+                    if key != 'judge_prompt_list'
+                }
+                if 'judge_prompt_list' in item:
+                    for prompt in item['judge_prompt_list']:
+                        temp_item = new_item.copy()
+                        temp_item['judge_prompt'] = prompt
+                        temp_references_save_list.append(temp_item)
+                else:
+                    temp_references_save_list.append(item)
+            references = temp_references_save_list
         predictions = [predictions['model_preds']]

         # Due to the rarity of identical predictions, we have temporarily disabled the plagiarism detection feature.
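The net effect of the multi_eval branch is that every model prediction is fanned out once per judge criterion. A standalone sketch of that expansion with made-up data is below; the committed code does the prediction and reference expansion in two separate passes, but the resulting pairing is the same when every sample carries the same number of criteria:

# Standalone sketch of the multi_eval expansion, with invented data.
predictions = {'model_preds': ['Draft essay A', 'Draft essay B']}
references = [
    {'index': 0, 'domain1': 'academic',
     'judge_prompt_list': ['Judge clarity of: {prediction}',
                           'Judge structure of: {prediction}']},
    {'index': 1, 'domain1': 'business',
     'judge_prompt_list': ['Judge clarity of: {prediction}',
                           'Judge structure of: {prediction}']},
]

expanded_preds, expanded_refs = [], []
for idx, pred in enumerate(predictions['model_preds']):
    for judge_prompt in references[idx]['judge_prompt_list']:
        # Each (prediction, criterion) pair becomes one judge input.
        expanded_preds.append(judge_prompt.replace('{prediction}', pred))
        expanded_refs.append({k: v for k, v in references[idx].items()
                              if k != 'judge_prompt_list'})

assert len(expanded_preds) == len(expanded_refs) == 4  # 2 preds x 2 criteria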
@@ -268,7 +297,12 @@ class LMEvaluator:

         if self.dataset_cfg:
             dataset = build_dataset_from_cfg(self.dataset_cfg)
+            if self.multi_eval:
+                new_ds = {
+                    k: dataset.test[k] * self.multi_eval_times
+                    for k in dataset.test.column_names
+                }
+                dataset.reader.dataset['test'] = Dataset.from_dict(new_ds)
             if infer_order == 'double':
                 new_ds = {
                     k: dataset.test[k] * 2
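The column-multiplication in the multi_eval branch simply repeats every dataset column so the number of test rows matches the expanded prediction list, mirroring the existing `infer_order == 'double'` trick. A minimal standalone illustration with toy columns (not the real WritingBench fields) follows:

# Toy columns standing in for dataset.test; real columns would be 'question' and 'judge'.
from datasets import Dataset

test_columns = {'question': ['q1', 'q2'], 'judge': [{'index': 0}, {'index': 1}]}
multi_eval_times = 3  # e.g. three criteria per WritingBench sample

# Column-wise repetition, as in the multi_eval branch above, then rebuilt as a Dataset.
repeated = Dataset.from_dict(
    {k: v * multi_eval_times for k, v in test_columns.items()})
print(len(repeated))  # 6 rows: every original row now appears multi_eval_times times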