mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

* fix pip version * add writingbench
117 lines
3.6 KiB
Python
# flake8: noqa
import json
import os.path as osp
import re
from collections import defaultdict

from datasets import Dataset

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference

base_prompt = """Evaluate the Response based on the Query and criteria provided.

** Criteria **
```{criteria}```

** Query **
```{question}```

** Response **
```{prediction}```

Provide your evaluation based on the criteria:

```{criteria}```

Provide reasons for each score, indicating where and why any strengths or deficiencies occur within the Response. Reference specific passages or elements from the text to support your justification.
Ensure that each reason is concrete, with explicit references to the text that aligns with the criteria requirements.

Scoring Range: Assign an integer score between 1 and 10

** Output format **
Return the results in the following JSON format. Only output this JSON format and nothing else:
```json
{{
"score": an integer score between 1 and 10,
"reason": "Specific and detailed justification for the score using text elements."
}}
```
"""
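
# Illustrative note (not part of the upstream file): the template above is
# filled in two passes. `WritingBenchDataset.load` below substitutes the query
# and one criterion while re-emitting the literal '{prediction}' placeholder,
# which the judge stage completes with the model output later. A hypothetical
# first pass, shown with plain `str.format` purely for illustration:
#
#     partial = base_prompt.format(question='Write a haiku about autumn.',
#                                  criteria='Imagery and adherence to form',
#                                  prediction='{prediction}')
#     # `partial` still contains '{prediction}' for the judge stage to fill.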


@LOAD_DATASET.register_module()
class WritingBenchDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        """Load WritingBench samples from ``{name}.jsonl`` and build one judge
        prompt per criterion for each query."""
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}.jsonl')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                domain1 = data['domain1']
                domain2 = data['domain2']
                query = data['query']
                criteria = data['criteria']
                judge_prompt_list = []
                for criteria_item in criteria:
                    # Keep the literal '{prediction}' placeholder so the judge
                    # prompt can be completed with the model output later.
                    temp_prompt = base_prompt.format(question=query,
                                                     criteria=criteria_item,
                                                     prediction='{prediction}')
                    judge_prompt_list.append(temp_prompt)
                idx = data['index']
                raw_data.append({
                    'question': query,
                    'judge': {
                        'index': idx,
                        'domain1': domain1,
                        'domain2': domain2,
                        'query': query,
                        'judge_prompt_list': judge_prompt_list
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
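
# Illustrative input record (hypothetical values; the real WritingBench data
# files define the exact schema). Each JSONL line is expected to provide at
# least the keys read by `WritingBenchDataset.load` above:
#
#     {"index": 0,
#      "domain1": "academic",
#      "domain2": "paper_outline",
#      "query": "Draft an outline for a survey on ...",
#      "criteria": ["Coverage of key topics", "Logical structure"]}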


def post_process_writingbench(judgement: dict):
    """Extract the integer score from a judge reply of the form
    {"score": 9, "reason": "The response provides..."}."""
    match = re.search(r"[\"']score[\"']:\s*([0-9]+)", judgement['prediction'])
    if match:
        score = int(match.group(1))
    else:
        return None

    return {'score': score}
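
# Minimal usage sketch (hypothetical judge output): the regex above accepts
# single- or double-quoted keys, so both calls behave as shown.
#
#     post_process_writingbench({'prediction': '{"score": 9, "reason": "ok"}'})
#     # -> {'score': 9}
#     post_process_writingbench({'prediction': 'no parsable score here'})
#     # -> None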


@DICT_POSTPROCESSORS.register_module('writingbench')
def writingbench_postprocess(output: dict, output_path: str) -> dict:
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_writingbench)

    # If no judgement was parsed successfully, `scores` simply stays empty and
    # the returned dict only carries the raw details.
    scores = defaultdict(list)
    for ans, ref in zip(judged_answers, references):
        domain = ref['domain1']
        score = ans['score']
        if score is not None:
            scores['overall'].append(score)
            scores[domain].append(score)

    single_model_scores = {
        task: sum(score_list) / len(score_list)
        for task, score_list in scores.items()
    }
    results = single_model_scores
    results['details'] = output
    return results
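
# Illustrative result shape (hypothetical numbers): one mean score overall and
# one per `domain1` value, plus the full judge output under 'details', e.g.
#
#     {'overall': 8.2, 'academic': 7.9, 'details': {...}}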