mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

* fix pip version * add writingbench
117 lines
3.6 KiB
Python
# flake8: noqa
import json
import os.path as osp
import re
from collections import defaultdict

from datasets import Dataset

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference

base_prompt = """Evaluate the Response based on the Query and criteria provided.

** Criteria **
```{criteria}```

** Query **
```{question}```

** Response **
```{prediction}```

Provide your evaluation based on the criteria:

```{criteria}```

Provide reasons for each score, indicating where and why any strengths or deficiencies occur within the Response. Reference specific passages or elements from the text to support your justification.
Ensure that each reason is concrete, with explicit references to the text that aligns with the criteria requirements.

Scoring Range: Assign an integer score between 1 and 10

** Output format **
Return the results in the following JSON format. Only output this JSON format and nothing else:
```json
{{
"score": an integer score between 1 and 10,
"reason": "Specific and detailed justification for the score using text elements."
}}
```
"""
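
# Illustrative note (not part of the upstream file): the template above is
# filled in two passes. `WritingBenchDataset.load` below substitutes the query
# and one criterion while re-emitting the literal '{prediction}' placeholder,
# which the judge stage completes with the model output later. A hypothetical
# first pass, shown with plain `str.format` purely for illustration:
#
#     partial = base_prompt.format(question='Write a haiku about autumn.',
#                                  criteria='Imagery and adherence to form',
#                                  prediction='{prediction}')
#     # `partial` still contains '{prediction}' for the judge stage to fill.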


@LOAD_DATASET.register_module()
class WritingBenchDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        """Load WritingBench samples from ``{name}.jsonl`` and build one judge
        prompt per criterion for each query."""
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}.jsonl')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                domain1 = data['domain1']
                domain2 = data['domain2']
                query = data['query']
                criteria = data['criteria']
                judge_prompt_list = []
                for criteria_item in criteria:
                    # Keep the literal '{prediction}' placeholder so the judge
                    # prompt can be completed with the model output later.
                    temp_prompt = base_prompt.format(question=query,
                                                     criteria=criteria_item,
                                                     prediction='{prediction}')
                    judge_prompt_list.append(temp_prompt)
                idx = data['index']
                raw_data.append({
                    'question': query,
                    'judge': {
                        'index': idx,
                        'domain1': domain1,
                        'domain2': domain2,
                        'query': query,
                        'judge_prompt_list': judge_prompt_list
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
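
# Illustrative input record (hypothetical values; the real WritingBench data
# files define the exact schema). Each JSONL line is expected to provide at
# least the keys read by `WritingBenchDataset.load` above:
#
#     {"index": 0,
#      "domain1": "academic",
#      "domain2": "paper_outline",
#      "query": "Draft an outline for a survey on ...",
#      "criteria": ["Coverage of key topics", "Logical structure"]}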


def post_process_writingbench(judgement: dict):
    """Extract the integer score from a judge reply of the form
    {"score": 9, "reason": "The response provides..."}."""
    match = re.search(r"[\"']score[\"']:\s*([0-9]+)", judgement['prediction'])
    if match:
        score = int(match.group(1))
    else:
        return None

    return {'score': score}
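
# Minimal usage sketch (hypothetical judge output): the regex above accepts
# single- or double-quoted keys, so both calls behave as shown.
#
#     post_process_writingbench({'prediction': '{"score": 9, "reason": "ok"}'})
#     # -> {'score': 9}
#     post_process_writingbench({'prediction': 'no parsable score here'})
#     # -> None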


@DICT_POSTPROCESSORS.register_module('writingbench')
def writingbench_postprocess(output: dict, output_path: str) -> dict:
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_writingbench)

    # If no judgement was parsed successfully, `scores` simply stays empty and
    # the returned dict only carries the raw details.
    scores = defaultdict(list)
    for ans, ref in zip(judged_answers, references):
        domain = ref['domain1']
        score = ans['score']
        if score is not None:
            scores['overall'].append(score)
            scores[domain].append(score)

    single_model_scores = {
        task: sum(score_list) / len(score_list)
        for task, score_list in scores.items()
    }
    results = single_model_scores
    results['details'] = output
    return results
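
# Illustrative result shape (hypothetical numbers): one mean score overall and
# one per `domain1` value, plus the full judge output under 'details', e.g.
#
#     {'overall': 8.2, 'academic': 7.9, 'details': {...}}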