Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Commit f6c519e283: add judgebench
@@ -0,0 +1,69 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WritingBenchDataset, writingbench_postprocess
from mmengine.config import read_base

subjective_reader_cfg = dict(
    input_columns=['question'],
    output_column='judge',
    )

subjective_all_sets = [
    'writingbench'
]

writingbench_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{question}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            multi_eval=True,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt='You are an expert evaluator with extensive experience in evaluating responses to a given query.')
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt='{prediction}'
                        ),
                    ]),
            ),
            dict_postprocessor=dict(type=writingbench_postprocess),
        ),
        pred_role='BOT',
    )

    writingbench_datasets.append(
        dict(
            abbr=f'{_name}',
            type=WritingBenchDataset,
            path='./data/subjective/writingbench',
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='singlescore',
        ))
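For orientation (not part of this diff): a minimal sketch of how writingbench_datasets is typically pulled into a top-level OpenCompass config. The import path and the commented models/judge_models wiring are assumptions modeled on other subjective benchmarks in the repo, not something this commit defines.

# Minimal sketch, assuming the usual OpenCompass subjective-eval layout.
from mmengine.config import read_base

with read_base():
    # Hypothetical module path; point it at wherever this config file lives.
    from .datasets.subjective.writingbench.writingbench import \
        writingbench_datasets

datasets = [*writingbench_datasets]
# models = [...]        # models under evaluation (user-supplied configs)
# judge_models = [...]  # LLM judge consumed by LMEvaluator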
@@ -35,3 +35,4 @@ from .subjective_cmp import SubjectiveCmpDataset  # noqa: F401, F403
from .wildbench import WildBenchDataset  # noqa: F401, F403
from .wildbench import wildbench_bradleyterry_postprocess  # noqa: F401, F403
from .wildbench import wildbench_postprocess  # noqa: F401, F403
from .writingbench import *
opencompass/datasets/subjective/writingbench.py (new file, 116 lines)
@@ -0,0 +1,116 @@
# flake8: noqa
import json
import os.path as osp
import re
from collections import defaultdict

from datasets import Dataset

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference

base_prompt = """Evaluate the Response based on the Query and criteria provided.

** Criteria **
```{criteria}```

** Query **
```{question}```

** Response **
```{prediction}```

Provide your evaluation based on the criteria:

```{criteria}```

Provide reasons for each score, indicating where and why any strengths or deficiencies occur within the Response. Reference specific passages or elements from the text to support your justification.
Ensure that each reason is concrete, with explicit references to the text that align with the criteria requirements.

Scoring Range: Assign an integer score between 1 and 10.

** Output format **
Return the results in the following JSON format. Only output this JSON format and nothing else:
```json
{{
    "score": an integer score between 1 and 10,
    "reason": "Specific and detailed justification for the score using text elements."
}}
```
"""


@LOAD_DATASET.register_module()
class WritingBenchDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}.jsonl')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                domain1 = data['domain1']
                domain2 = data['domain2']
                query = data['query']
                criteria = data['criteria']
                judge_prompt_list = []
                for criteria_item in criteria:
                    temp_prompt = base_prompt.format(question=query,
                                                     criteria=criteria_item,
                                                     prediction='{prediction}')
                    judge_prompt_list.append(temp_prompt)
                idx = data['index']
                raw_data.append({
                    'question': query,
                    'judge': {
                        'index': idx,
                        'domain1': domain1,
                        'domain2': domain2,
                        'query': query,
                        'judge_prompt_list': judge_prompt_list
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset


def post_process_writingbench(judgement: dict):
    """Input a string like below:

    {"score": 9, "reason": "The response provides..."}, and extract the score
    """
    match = re.search(r"[\"']score[\"']:\s*([0-9]+)", judgement['prediction'])
    if match:
        score = int(match.group(1))
    else:
        return None

    return {'score': score}


@DICT_POSTPROCESSORS.register_module('writingbench')
def writingbench_postprocess(output: dict, output_path: str) -> dict:
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_writingbench)

    if len(judged_answers) == 0:
        scores = None

    scores = defaultdict(list)
    for ans, ref in zip(judged_answers, references):
        domain = ref['domain1']
        score = ans['score']
        if score is not None:
            scores['overall'].append(score)
            scores[domain].append(score)
    single_model_scores = {
        task: sum(score) / len(score)
        for task, score in scores.items()
    }
    results = single_model_scores
    results['details'] = output
    return results
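For illustration only (not part of the commit): the shape of a {name}.jsonl record that WritingBenchDataset.load reads above, with made-up values, plus the result post_process_writingbench produces for a typical judge reply.

# Hypothetical JSONL record; 'criteria' yields one judge prompt per entry
# via base_prompt.format(...).
sample_record = {
    'index': 0,
    'domain1': 'academic',        # illustrative domain labels
    'domain2': 'survey_outline',
    'query': 'Draft an outline for a survey on LLM-as-a-judge evaluation.',
    'criteria': [
        'Comprehensiveness of the outline',
        'Logical structure and ordering',
    ],
}

# A judge reply in the requested JSON format; post_process_writingbench
# extracts the integer score with its regex.
sample_judgement = {'prediction': '{"score": 9, "reason": "Covers all required sections."}'}
# post_process_writingbench(sample_judgement) == {'score': 9}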
@@ -116,6 +116,7 @@ class LMEvaluator:
        pred_postprocessor (ConfigDict): The model prediction's postprocessor
            config.
        keep_predictions (bool): Whether to save model predictions in references. Useful when postprocessor requires model predictions as input to calculate additional features (e.g. response length, markdown list counts, ...). Defaults to False.
        multi_eval (bool): Whether to do multiple evaluations with different prompt settings.
    """

    def __init__(
@@ -129,7 +130,9 @@ class LMEvaluator:
        pred_postprocessor: Optional[ConfigDict] = None,
        dict_postprocessor: Optional[ConfigDict] = None,
        keep_predictions: bool = False,
        multi_eval: bool = False,
    ) -> None:
        self.multi_eval = multi_eval
        self.output_path = output_path
        out_dir, out_name = osp.split(output_path)
        if not out_dir:
@@ -209,6 +212,33 @@ class LMEvaluator:
            references = [
                {} for _ in range(len(predictions[0]['model_preds']))
            ]
        if self.multi_eval:
            assert references is not None
            assert 'judge_prompt_list' in references[0]
            self.multi_eval_times = len(references[0]['judge_prompt_list'])
            temp_predictions_save_list = []
            for idx, pred in enumerate(predictions['model_preds']):
                for judge_prompt in references[idx]['judge_prompt_list']:
                    temp_prediction = judge_prompt.replace(
                        '{prediction}', pred)
                    temp_predictions_save_list.append(temp_prediction)
            predictions['model_preds'] = temp_predictions_save_list

            temp_references_save_list = []
            for item in references:
                new_item = {
                    key: value
                    for key, value in item.items()
                    if key != 'judge_prompt_list'
                }
                if 'judge_prompt_list' in item:
                    for prompt in item['judge_prompt_list']:
                        temp_item = new_item.copy()
                        temp_item['judge_prompt'] = prompt
                        temp_references_save_list.append(temp_item)
                else:
                    temp_references_save_list.append(item)
            references = temp_references_save_list
            predictions = [predictions['model_preds']]

        # Due to the rarity of identical predictions, we have temporarily disabled the plagiarism detection feature.
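A standalone sketch (illustrative values only, not part of the commit) of what the multi_eval branch above does to a single prediction: every entry in judge_prompt_list becomes its own judge input with '{prediction}' substituted.

# Mirrors the expansion loop above, outside of OpenCompass.
references = [{'judge_prompt_list': [
    'Evaluate fluency of the Response.\n{prediction}',
    'Evaluate structure of the Response.\n{prediction}',
]}]
predictions = {'model_preds': ['My essay ...']}

expanded = []
for idx, pred in enumerate(predictions['model_preds']):
    for judge_prompt in references[idx]['judge_prompt_list']:
        expanded.append(judge_prompt.replace('{prediction}', pred))

# One prediction becomes len(judge_prompt_list) judge inputs.
assert len(expanded) == 2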
@@ -268,7 +298,12 @@ class LMEvaluator:

        if self.dataset_cfg:
            dataset = build_dataset_from_cfg(self.dataset_cfg)

            if self.multi_eval:
                new_ds = {
                    k: dataset.test[k] * self.multi_eval_times
                    for k in dataset.test.column_names
                }
                dataset.reader.dataset['test'] = Dataset.from_dict(new_ds)
            if infer_order == 'double':
                new_ds = {
                    k: dataset.test[k] * 2
@@ -329,4 +364,4 @@ class LMEvaluator:
        else:
            kwargs = self.dict_postprocessor
            proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
            return proc(output, self.output_path, **kwargs)
            return proc(output, self.output_path, **kwargs)
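Finally, a toy sketch (not WritingBench data) of the column duplication the multi_eval branch adds to the dataset handling above, so the judge-side dataset's length matches the expanded prediction list.

from datasets import Dataset

ds = Dataset.from_dict({'question': ['q1', 'q2']})
multi_eval_times = 3  # e.g. three criteria per query
new_ds = {k: ds[k] * multi_eval_times for k in ds.column_names}
ds = Dataset.from_dict(new_ds)
assert len(ds) == 6  # each column repeated block-wise, multi_eval_times copies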