OpenCompass/opencompass/datasets/subjective/writingbench.py

# flake8: noqa
import json
import os.path as osp
import re
from collections import defaultdict

from datasets import Dataset

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference


base_prompt = """Evaluate the Response based on the Query and criteria provided.

** Criteria **
```{criteria}```

** Query **
```{question}```

** Response **
```{prediction}```

Provide your evaluation based on the criteria:
```{criteria}```

Provide reasons for each score, indicating where and why any strengths or deficiencies occur within the Response. Reference specific passages or elements from the text to support your justification.
Ensure that each reason is concrete, with explicit references to the text that align with the criteria requirements.
Scoring Range: Assign an integer score between 1 and 10.

** Output format **
Return the results in the following JSON format. Only output this JSON format and nothing else:
```json
{{
    "score": an integer score between 1 and 10,
    "reason": "Specific and detailed justification for the score using text elements."
}}
```
"""


@LOAD_DATASET.register_module()
class WritingBenchDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}.jsonl')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                domain1 = data['domain1']
                domain2 = data['domain2']
                query = data['query']
                criteria = data['criteria']
                judge_prompt_list = []
                # Build one judge prompt per criterion: the query and the
                # criterion are filled in now, while '{prediction}' is kept
                # as a placeholder for the response evaluated later.
                for criteria_item in criteria:
                    temp_prompt = base_prompt.format(question=query,
                                                     criteria=criteria_item,
                                                     prediction='{prediction}')
                    judge_prompt_list.append(temp_prompt)
                idx = data['index']
                raw_data.append({
                    'question': query,
                    'judge': {
                        'index': idx,
                        'domain1': domain1,
                        'domain2': domain2,
                        'query': query,
                        'judge_prompt_list': judge_prompt_list
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
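
# Illustrative shape of one input JSONL record (values are hypothetical;
# only the key names are required by load() above):
#   {"index": 0, "domain1": "...", "domain2": "...",
#    "query": "...", "criteria": ["criterion text 1", "criterion text 2"]}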


def post_process_writingbench(judgement: dict):
    """Extract the integer score from a judge response of the form:

    {"score": 9, "reason": "The response provides..."}
    """
    match = re.search(r"[\"']score[\"']:\s*([0-9]+)", judgement['prediction'])
    if match:
        score = int(match.group(1))
    else:
        return None
    return {'score': score}
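
# Minimal usage sketch (the judge output below is hypothetical):
#   post_process_writingbench({'prediction': '{"score": 9, "reason": "..."}'})
#   -> {'score': 9}
# Judge output without a parseable score yields None instead.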


@DICT_POSTPROCESSORS.register_module('writingbench')
def writingbench_postprocess(output: dict, output_path: str) -> dict:
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_writingbench)

    # Group scores by domain; 'overall' collects every parsed score.
    scores = defaultdict(list)
    for ans, ref in zip(judged_answers, references):
        domain = ref['domain1']
        score = ans['score']
        if score is not None:
            scores['overall'].append(score)
            scores[domain].append(score)

    # Average each group; if no answers were parsed, this stays empty.
    single_model_scores = {
        task: sum(score) / len(score)
        for task, score in scores.items()
    }
    results = single_model_scores
    results['details'] = output
    return results
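
# Returned dict (illustrative): per-domain mean scores keyed by each record's
# 'domain1' value, plus an 'overall' mean and the raw evaluator output under
# 'details', e.g. {'overall': 7.4, '<domain1>': 7.1, ..., 'details': {...}}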