From b93afe77649c9b7dcae486d5fe22f51544f42594 Mon Sep 17 00:00:00 2001
From: bittersweet1999 <1487910649@qq.com>
Date: Fri, 18 Apr 2025 09:21:01 +0000
Subject: [PATCH] add writingbench

---
 .../writingbench/writingbench_judge.py        |  69 +++++++++++
 opencompass/datasets/subjective/__init__.py   |   1 +
 .../datasets/subjective/writingbench.py       | 114 ++++++++++++++++++
 .../openicl/icl_evaluator/lm_evaluator.py     |  38 +++++-
 4 files changed, 220 insertions(+), 2 deletions(-)
 create mode 100644 opencompass/configs/datasets/subjective/writingbench/writingbench_judge.py
 create mode 100644 opencompass/datasets/subjective/writingbench.py

diff --git a/opencompass/configs/datasets/subjective/writingbench/writingbench_judge.py b/opencompass/configs/datasets/subjective/writingbench/writingbench_judge.py
new file mode 100644
index 00000000..64e46172
--- /dev/null
+++ b/opencompass/configs/datasets/subjective/writingbench/writingbench_judge.py
@@ -0,0 +1,69 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import WritingBenchDataset, writingbench_postprocess
+from mmengine.config import read_base
+
+subjective_reader_cfg = dict(
+    input_columns=['question'],
+    output_column='judge',
+    )
+
+subjective_all_sets = [
+    'writingbench'
+]
+
+writingbench_datasets = []
+
+for _name in subjective_all_sets:
+    subjective_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{question}'
+                ),
+            ]),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer,),
+    )
+
+    subjective_eval_cfg = dict(
+        evaluator=dict(
+            type=LMEvaluator,
+            multi_eval=True,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt="You are an expert evaluator with extensive experience in evaluating responses to a given query.")
+                    ],
+                    round=[
+                        dict(
+                            role='HUMAN',
+                            prompt='{prediction}'
+                        ),
+                    ]),
+            ),
+            dict_postprocessor=dict(type=writingbench_postprocess),
+        ),
+        pred_role='BOT',
+    )
+
+    writingbench_datasets.append(
+        dict(
+            abbr=f'{_name}',
+            type=WritingBenchDataset,
+            path='./data/subjective/writingbench',
+            name=_name,
+            reader_cfg=subjective_reader_cfg,
+            infer_cfg=subjective_infer_cfg,
+            eval_cfg=subjective_eval_cfg,
+            mode='singlescore',
+        ))
diff --git a/opencompass/datasets/subjective/__init__.py b/opencompass/datasets/subjective/__init__.py
index 8d663528..646c619e 100644
--- a/opencompass/datasets/subjective/__init__.py
+++ b/opencompass/datasets/subjective/__init__.py
@@ -35,3 +35,4 @@ from .subjective_cmp import SubjectiveCmpDataset  # noqa: F401, F403
 from .wildbench import WildBenchDataset  # noqa: F401, F403
 from .wildbench import wildbench_bradleyterry_postprocess  # noqa: F401, F403
 from .wildbench import wildbench_postprocess  # noqa: F401, F403
+from .writingbench import *  # noqa: F401, F403
diff --git a/opencompass/datasets/subjective/writingbench.py b/opencompass/datasets/subjective/writingbench.py
new file mode 100644
index 00000000..fb6b0a40
--- /dev/null
+++ b/opencompass/datasets/subjective/writingbench.py
@@ -0,0 +1,114 @@
+# flake8: noqa
+import json
+import os.path as osp
+import re
+from collections import defaultdict
+
+from datasets import Dataset
+
+from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
+from opencompass.utils import get_data_path
+
+from ..base import BaseDataset
+from .utils import get_judgeanswer_and_reference
+
+
+base_prompt = """Evaluate the Response based on the Query and criteria provided.
+
+** Criteria **
+```{criteria}```
+
+** Query **
+```{question}```
+
+** Response **
+```{prediction}```
+
+Provide your evaluation based on the criteria:
+
+```{criteria}```
+
+Provide reasons for each score, indicating where and why any strengths or deficiencies occur within the Response. Reference specific passages or elements from the text to support your justification.
+Ensure that each reason is concrete, with explicit references to the text that align with the criteria requirements.
+
+Scoring Range: Assign an integer score between 1 and 10
+
+** Output format **
+Return the results in the following JSON format. Output only this JSON format and nothing else:
+```json
+{{
+    "score": an integer score between 1 and 10,
+    "reason": "Specific and detailed justification for the score using text elements."
+}}
+```
+"""
+
+@LOAD_DATASET.register_module()
+class WritingBenchDataset(BaseDataset):
+
+    def load(self, path: str, name: str, *args, **kwargs):
+        path = get_data_path(path, local_mode=True)
+        filename = osp.join(path, f'{name}.jsonl')
+        raw_data = []
+        with open(filename, 'r', encoding='utf-8') as f:
+            for line in f:
+                data = json.loads(line)
+                domain1 = data['domain1']
+                domain2 = data['domain2']
+                query = data['query']
+                criteria = data['criteria']
+                judge_prompt_list = []
+                for criteria_item in criteria:
+                    temp_prompt = base_prompt.format(question=query, criteria=criteria_item, prediction='{prediction}')
+                    judge_prompt_list.append(temp_prompt)
+                idx = data['index']
+                raw_data.append({
+                    'question': query,
+                    'judge': {
+                        'index': idx,
+                        'domain1': domain1,
+                        'domain2': domain2,
+                        'query': query,
+                        'judge_prompt_list': judge_prompt_list
+                    }
+                })
+        dataset = Dataset.from_list(raw_data)
+        return dataset
+
+
+def post_process_writingbench(judgement: dict):
+    """Extract the integer score from a judge prediction whose text contains
+    a JSON blob such as:
+    {"score": 9, "reason": "The response provides..."}
+    """
+    match = re.search(r"[\"']score[\"']:\s*([0-9]+)", judgement['prediction'])
+    if match:
+        score = int(match.group(1))
+    else:
+        return None
+
+    return {'score': score}
+
+
+@DICT_POSTPROCESSORS.register_module('writingbench')
+def writingbench_postprocess(output: dict, output_path: str) -> dict:
+    judged_answers, references = get_judgeanswer_and_reference(
+        output, output_path, post_process_writingbench)
+
+    if len(judged_answers) == 0:
+        return {'error': 'no valid judge score was parsed', 'details': output}
+
+    scores = defaultdict(list)
+    for ans, ref in zip(judged_answers, references):
+        domain = ref['domain1']
+        score = ans['score']
+        if score is not None:
+            scores['overall'].append(score)
+            scores[domain].append(score)
+    single_model_scores = {
+        task: sum(score) / len(score)
+        for task, score in scores.items()
+    }
+    results = single_model_scores
+    results['details'] = output
+    return results
diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py
index 94f2cf94..4b105f21 100644
--- a/opencompass/openicl/icl_evaluator/lm_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py
@@ -129,7 +129,9 @@ class LMEvaluator:
         pred_postprocessor: Optional[ConfigDict] = None,
         dict_postprocessor: Optional[ConfigDict] = None,
         keep_predictions: bool = False,
+        multi_eval: bool = False,
     ) -> None:
+        self.multi_eval = multi_eval
         self.output_path = output_path
         out_dir, out_name = osp.split(output_path)
         if not out_dir:
@@ -209,6 +211,33 @@ class LMEvaluator:
                 references = [
                     {} for _ in range(len(predictions[0]['model_preds']))
                 ]
+            if self.multi_eval:
+                assert references is not None
+                assert 'judge_prompt_list' in references[0]
+                self.multi_eval_times = len(references[0]['judge_prompt_list'])
+                temp_predictions_save_list = []
+                for idx, pred in enumerate(predictions['model_preds']):
+                    for judge_prompt in references[idx]['judge_prompt_list']:
+                        temp_prediction = judge_prompt.replace(
+                            '{prediction}', pred)
+                        temp_predictions_save_list.append(temp_prediction)
+                predictions['model_preds'] = temp_predictions_save_list
+
+                temp_references_save_list = []
+                for item in references:
+                    new_item = {
+                        key: value
+                        for key, value in item.items()
+                        if key != 'judge_prompt_list'
+                    }
+                    if 'judge_prompt_list' in item:
+                        for prompt in item['judge_prompt_list']:
+                            temp_item = new_item.copy()
+                            temp_item['judge_prompt'] = prompt
+                            temp_references_save_list.append(temp_item)
+                    else:
+                        temp_references_save_list.append(item)
+                references = temp_references_save_list
             predictions = [predictions['model_preds']]
 
         # Due to the rarity of identical predictions, we have temporarily disabled the plagiarism detection feature.
@@ -268,7 +297,12 @@ class LMEvaluator:
 
         if self.dataset_cfg:
            dataset = build_dataset_from_cfg(self.dataset_cfg)
-
+            if self.multi_eval:
+                new_ds = {
+                    k: dataset.test[k] * self.multi_eval_times
+                    for k in dataset.test.column_names
+                }
+                dataset.reader.dataset['test'] = Dataset.from_dict(new_ds)
             if infer_order == 'double':
                 new_ds = {
                     k: dataset.test[k] * 2
@@ -329,4 +363,4 @@ class LMEvaluator:
         else:
             kwargs = self.dict_postprocessor
         proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
-        return proc(output, self.output_path, **kwargs)
\ No newline at end of file
+        return proc(output, self.output_path, **kwargs)
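
For reference: `WritingBenchDataset.load()` above reads one JSON object per line from `./data/subjective/writingbench/writingbench.jsonl` and expects the fields `index`, `domain1`, `domain2`, `query`, and a list-valued `criteria`; one judge prompt is built per criterion, and `post_process_writingbench()` later pulls the integer after `"score"` out of the judge reply. Below is a minimal sketch of that round trip; only the field names and the regex come from the code above, the concrete values are invented.

```python
# Sketch only: field names mirror WritingBenchDataset.load(); the values are invented.
import json
import re

sample_line = {
    'index': 0,
    'domain1': 'academic',              # hypothetical domain labels
    'domain2': 'paper_abstract',
    'query': 'Write a 200-word abstract about ...',
    'criteria': ['Clarity of argument', 'Coverage of required points'],
}
print(json.dumps(sample_line, ensure_ascii=False))  # one line of writingbench.jsonl

# The judge is asked to answer with a JSON blob; post_process_writingbench()
# only needs the integer that follows "score".
judge_reply = '```json\n{"score": 8, "reason": "Covers all points clearly."}\n```'
match = re.search(r"[\"']score[\"']:\s*([0-9]+)", judge_reply)
assert match is not None and int(match.group(1)) == 8
```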