Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Commit f6c519e283: add judgebench
@@ -0,0 +1,69 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WritingBenchDataset, writingbench_postprocess
from mmengine.config import read_base

subjective_reader_cfg = dict(
    input_columns=['question'],
    output_column='judge',
    )

subjective_all_sets = [
    'writingbench'
]

writingbench_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{question}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            multi_eval=True,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt='You are an expert evaluator with extensive experience in evaluating responses to a given query.')
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt='{prediction}'
                        ),
                    ]),
            ),
            dict_postprocessor=dict(type=writingbench_postprocess),
        ),
        pred_role='BOT',
    )

    writingbench_datasets.append(
        dict(
            abbr=f'{_name}',
            type=WritingBenchDataset,
            path='./data/subjective/writingbench',
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='singlescore',
        ))
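For orientation (not part of this diff): a minimal sketch of how writingbench_datasets is typically pulled into a top-level OpenCompass config. The import path and the commented models/judge_models wiring are assumptions modeled on other subjective benchmarks in the repo, not something this commit defines.

# Minimal sketch, assuming the usual OpenCompass subjective-eval layout.
from mmengine.config import read_base

with read_base():
    # Hypothetical module path; point it at wherever this config file lives.
    from .datasets.subjective.writingbench.writingbench import \
        writingbench_datasets

datasets = [*writingbench_datasets]
# models = [...]        # models under evaluation (user-supplied configs)
# judge_models = [...]  # LLM judge consumed by LMEvaluator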
@@ -35,3 +35,4 @@ from .subjective_cmp import SubjectiveCmpDataset  # noqa: F401, F403
from .wildbench import WildBenchDataset  # noqa: F401, F403
from .wildbench import wildbench_bradleyterry_postprocess  # noqa: F401, F403
from .wildbench import wildbench_postprocess  # noqa: F401, F403
from .writingbench import *
opencompass/datasets/subjective/writingbench.py (new file, 116 lines)
@@ -0,0 +1,116 @@
# flake8: noqa
import json
import os.path as osp
import re
from collections import defaultdict

from datasets import Dataset

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference

base_prompt = """Evaluate the Response based on the Query and criteria provided.

** Criteria **
```{criteria}```

** Query **
```{question}```

** Response **
```{prediction}```

Provide your evaluation based on the criteria:

```{criteria}```

Provide reasons for each score, indicating where and why any strengths or deficiencies occur within the Response. Reference specific passages or elements from the text to support your justification.
Ensure that each reason is concrete, with explicit references to the text that align with the criteria requirements.

Scoring Range: Assign an integer score between 1 and 10.

** Output format **
Return the results in the following JSON format. Only output this JSON format and nothing else:
```json
{{
    "score": an integer score between 1 and 10,
    "reason": "Specific and detailed justification for the score using text elements."
}}
```
"""


@LOAD_DATASET.register_module()
class WritingBenchDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}.jsonl')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                domain1 = data['domain1']
                domain2 = data['domain2']
                query = data['query']
                criteria = data['criteria']
                judge_prompt_list = []
                for criteria_item in criteria:
                    temp_prompt = base_prompt.format(question=query,
                                                     criteria=criteria_item,
                                                     prediction='{prediction}')
                    judge_prompt_list.append(temp_prompt)
                idx = data['index']
                raw_data.append({
                    'question': query,
                    'judge': {
                        'index': idx,
                        'domain1': domain1,
                        'domain2': domain2,
                        'query': query,
                        'judge_prompt_list': judge_prompt_list
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset


def post_process_writingbench(judgement: dict):
    """Input a string like below:

    {"score": 9, "reason": "The response provides..."}, and extract the score
    """
    match = re.search(r"[\"']score[\"']:\s*([0-9]+)", judgement['prediction'])
    if match:
        score = int(match.group(1))
    else:
        return None

    return {'score': score}


@DICT_POSTPROCESSORS.register_module('writingbench')
def writingbench_postprocess(output: dict, output_path: str) -> dict:
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_writingbench)

    if len(judged_answers) == 0:
        scores = None

    scores = defaultdict(list)
    for ans, ref in zip(judged_answers, references):
        domain = ref['domain1']
        score = ans['score']
        if score is not None:
            scores['overall'].append(score)
            scores[domain].append(score)
    single_model_scores = {
        task: sum(score) / len(score)
        for task, score in scores.items()
    }
    results = single_model_scores
    results['details'] = output
    return results
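For illustration only (not part of the commit): the shape of a {name}.jsonl record that WritingBenchDataset.load reads above, with made-up values, plus the result post_process_writingbench produces for a typical judge reply.

# Hypothetical JSONL record; 'criteria' yields one judge prompt per entry
# via base_prompt.format(...).
sample_record = {
    'index': 0,
    'domain1': 'academic',        # illustrative domain labels
    'domain2': 'survey_outline',
    'query': 'Draft an outline for a survey on LLM-as-a-judge evaluation.',
    'criteria': [
        'Comprehensiveness of the outline',
        'Logical structure and ordering',
    ],
}

# A judge reply in the requested JSON format; post_process_writingbench
# extracts the integer score with its regex.
sample_judgement = {'prediction': '{"score": 9, "reason": "Covers all required sections."}'}
# post_process_writingbench(sample_judgement) == {'score': 9}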
@@ -116,6 +116,7 @@ class LMEvaluator:
        pred_postprocessor (ConfigDict): The model prediction's postprocessor
            config.
        keep_predictions (bool): Whether to save model predictions in references. Useful when postprocessor requires model predictions as input to calculate additional features (e.g. response length, markdown list counts, ...). Defaults to False.
        multi_eval (bool): Whether to do multiple evaluations with different prompt settings.
    """

    def __init__(
@@ -129,7 +130,9 @@ class LMEvaluator:
        pred_postprocessor: Optional[ConfigDict] = None,
        dict_postprocessor: Optional[ConfigDict] = None,
        keep_predictions: bool = False,
        multi_eval: bool = False,
    ) -> None:
        self.multi_eval = multi_eval
        self.output_path = output_path
        out_dir, out_name = osp.split(output_path)
        if not out_dir:
@@ -209,6 +212,33 @@ class LMEvaluator:
            references = [
                {} for _ in range(len(predictions[0]['model_preds']))
            ]
        if self.multi_eval:
            assert references is not None
            assert 'judge_prompt_list' in references[0]
            self.multi_eval_times = len(references[0]['judge_prompt_list'])
            temp_predictions_save_list = []
            for idx, pred in enumerate(predictions['model_preds']):
                for judge_prompt in references[idx]['judge_prompt_list']:
                    temp_prediction = judge_prompt.replace(
                        '{prediction}', pred)
                    temp_predictions_save_list.append(temp_prediction)
            predictions['model_preds'] = temp_predictions_save_list

            temp_references_save_list = []
            for item in references:
                new_item = {
                    key: value
                    for key, value in item.items()
                    if key != 'judge_prompt_list'
                }
                if 'judge_prompt_list' in item:
                    for prompt in item['judge_prompt_list']:
                        temp_item = new_item.copy()
                        temp_item['judge_prompt'] = prompt
                        temp_references_save_list.append(temp_item)
                else:
                    temp_references_save_list.append(item)
            references = temp_references_save_list
            predictions = [predictions['model_preds']]

        # Due to the rarity of identical predictions, we have temporarily disabled the plagiarism detection feature.
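A standalone sketch (illustrative values only, not part of the commit) of what the multi_eval branch above does to a single prediction: every entry in judge_prompt_list becomes its own judge input with '{prediction}' substituted.

# Mirrors the expansion loop above, outside of OpenCompass.
references = [{'judge_prompt_list': [
    'Evaluate fluency of the Response.\n{prediction}',
    'Evaluate structure of the Response.\n{prediction}',
]}]
predictions = {'model_preds': ['My essay ...']}

expanded = []
for idx, pred in enumerate(predictions['model_preds']):
    for judge_prompt in references[idx]['judge_prompt_list']:
        expanded.append(judge_prompt.replace('{prediction}', pred))

# One prediction becomes len(judge_prompt_list) judge inputs.
assert len(expanded) == 2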
@@ -268,7 +298,12 @@ class LMEvaluator:

        if self.dataset_cfg:
            dataset = build_dataset_from_cfg(self.dataset_cfg)

            if self.multi_eval:
                new_ds = {
                    k: dataset.test[k] * self.multi_eval_times
                    for k in dataset.test.column_names
                }
                dataset.reader.dataset['test'] = Dataset.from_dict(new_ds)
            if infer_order == 'double':
                new_ds = {
                    k: dataset.test[k] * 2
@@ -329,4 +364,4 @@ class LMEvaluator:
        else:
            kwargs = self.dict_postprocessor
            proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
            return proc(output, self.output_path, **kwargs)
            return proc(output, self.output_path, **kwargs)
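Finally, a toy sketch (not WritingBench data) of the column duplication the multi_eval branch adds to the dataset handling above, so the judge-side dataset's length matches the expanded prediction list.

from datasets import Dataset

ds = Dataset.from_dict({'question': ['q1', 'q2']})
multi_eval_times = 3  # e.g. three criteria per query
new_ds = {k: ds[k] * multi_eval_times for k in ds.column_names}
ds = Dataset.from_dict(new_ds)
assert len(ds) == 6  # each column repeated block-wise, multi_eval_times copies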