From dae700e0b011af4629c3193040e0102bf6700497 Mon Sep 17 00:00:00 2001
From: chenzihong-gavin
Date: Mon, 14 Apr 2025 14:23:29 +0800
Subject: [PATCH] [Dataset] Add SeedBench Dataset

---
 dataset-index.yml                             |   5 +
 .../datasets/SeedBench/seedbench_gen.py       |   5 +
 .../SeedBench/seedbench_gen_44868b.py         |  77 ++++
 opencompass/datasets/SeedBench.py             | 340 ++++++++++++++++++
 opencompass/datasets/__init__.py              |   1 +
 5 files changed, 428 insertions(+)
 create mode 100644 opencompass/configs/datasets/SeedBench/seedbench_gen.py
 create mode 100644 opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py
 create mode 100644 opencompass/datasets/SeedBench.py

diff --git a/dataset-index.yml b/dataset-index.yml
index f1581c21..e4dcca61 100644
--- a/dataset-index.yml
+++ b/dataset-index.yml
@@ -811,6 +811,11 @@
     paper: https://arxiv.org/pdf/2407.13168
     configpath: opencompass/configs/datasets/scicode/scicode_gen.py
     configpath_llmjudge: ''
+- seedbench:
+    name: SeedBench
+    category: Knowledge
+    paper: ''
+    configpath: opencompass/configs/datasets/SeedBench/seedbench_gen.py
 - simpleqa:
     name: SimpleQA
     category: Knowledge
diff --git a/opencompass/configs/datasets/SeedBench/seedbench_gen.py b/opencompass/configs/datasets/SeedBench/seedbench_gen.py
new file mode 100644
index 00000000..0714869a
--- /dev/null
+++ b/opencompass/configs/datasets/SeedBench/seedbench_gen.py
@@ -0,0 +1,5 @@
+from mmengine.config import read_base
+
+with read_base():
+    # Default config: rule-based evaluators (Acc / F1 / ROUGE), no LLM judge
+    from .seedbench_gen_44868b import seedbench_datasets  # noqa: F401, F403
diff --git a/opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py b/opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py
new file mode 100644
index 00000000..ba053fc9
--- /dev/null
+++ b/opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py
@@ -0,0 +1,77 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator, JiebaRougeEvaluator, RougeEvaluator
+from opencompass.datasets.SeedBench import SeedBenchDataset, F1ScoreEvaluator, my_multiple_select_postprocess, AverageRougeScoreEvaluator
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+
+agri_data_dir = './data/SeedBench'
+
+agri_reader_cfg = dict(
+    input_columns=['instruction', 'question'],
+    output_column='answer'
+)
+
+agri_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='{instruction}\n{question}\n'
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+
+default_dataset_cfg = {
+    'type': SeedBenchDataset,
+    'path': 'json',
+    'reader_cfg': agri_reader_cfg,
+    'infer_cfg': agri_infer_cfg,
+}
+
+dataset_configs = [
+    # 1-n
+    {'abbr': 'seedbench_1-1', 'data_file': '1-1.json', 'evaluator': 'AccEvaluator',
+     'pred_postprocessor': dict(type=first_option_postprocess, options='ABCD')},
+    {'abbr': 'seedbench_1-2', 'data_file': '1-2.json', 'evaluator': 'F1ScoreEvaluator',
+     'pred_postprocessor': dict(type=my_multiple_select_postprocess)},
+    # {'abbr': 'seedbench_1-3_em', 'data_file': '1-3.json', 'evaluator': 'ExactMatchScoreEvaluator'},
+    {'abbr': 'seedbench_1-3', 'data_file': '1-3.json', 'evaluator': 'AverageRougeScoreEvaluator'},
+    {'abbr': 'seedbench_1-4', 'data_file': '1-4.json', 'evaluator': 'RougeEvaluator'},
+    # 2-n
+    {'abbr': 'seedbench_2-1', 'data_file': '2-1.json', 'evaluator': 'RougeEvaluator'},
+    {'abbr': 
'seedbench_2-2', 'data_file': '2-2.json', 'evaluator': 'RougeEvaluator'}, + # 3-n + {'abbr': 'seedbench_3-1', 'data_file': '3-1.json', 'evaluator': 'AccEvaluator', + 'pred_postprocessor': dict(type=first_option_postprocess, options='ABCD')}, + {'abbr': 'seedbench_3-2', 'data_file': '3-2.json', 'evaluator': 'F1ScoreEvaluator', + 'pred_postprocessor': dict(type=my_multiple_select_postprocess)}, + # {'abbr': 'seedbench_3-3_em', 'data_file': '3-3.json', 'evaluator': 'ExactMatchScoreEvaluator'}, + {'abbr': 'seedbench_3-3', 'data_file': '3-3.json', 'evaluator': 'AverageRougeScoreEvaluator'}, + {'abbr': 'seedbench_3-4', 'data_file': '3-4.json', 'evaluator': 'RougeEvaluator'}, + {'abbr': 'seedbench_3-5', 'data_file': '3-5.json', 'evaluator': 'AccScoreStr_Evaluator'}, +] + + +seedbench_datasets = [] +for stage in ['zero-shot','one-shot']: + for config in dataset_configs: + eval_cfg = dict( + evaluator=dict(type=config['evaluator']) + ) + if 'pred_postprocessor' in config: + eval_cfg['pred_postprocessor'] = config['pred_postprocessor'] + data_file = f"{agri_data_dir}/{stage}/{config['data_file']}" + abbr_name = f"{config['abbr']}_{stage}" + seedbench_datasets.append( + dict( + type=SeedBenchDataset, + abbr=abbr_name, + data_files=data_file, + path='json', + reader_cfg=agri_reader_cfg, + infer_cfg=agri_infer_cfg, + eval_cfg=eval_cfg + ) + ) diff --git a/opencompass/datasets/SeedBench.py b/opencompass/datasets/SeedBench.py new file mode 100644 index 00000000..fa8bd962 --- /dev/null +++ b/opencompass/datasets/SeedBench.py @@ -0,0 +1,340 @@ +import os +import random +import datasets +from typing import List +from .base import BaseDataset +from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator +import numpy as np +import re +import jieba +from rouge_chinese import Rouge +from opencompass.registry import ICL_EVALUATORS, TEXT_POSTPROCESSORS + + +class SeedBenchDataset(BaseDataset): + @staticmethod + def load(data_files: str, path: str = 'json', split: str = None, **kwargs) -> datasets.Dataset: + dataset = datasets.load_dataset(path, data_files=data_files, **kwargs) + + if split is None: + split = list(dataset.keys())[0] + print(f"my datasets split : {split}") + + if split not in dataset: + raise ValueError(f"Split '{split}' not found. Available splits: {list(dataset.keys())}") + + return dataset[split] + + +class F1Evaluator(BaseEvaluator): + """F1 Score evaluator for multiple choice questions. + + Args: + seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0. + """ + + def __init__(self, seed: int = 0) -> None: + self.seed = seed + super().__init__() + + def _preprocess(self, predictions: List, references: List) -> dict: + """Preprocess the final predictions and references to needed format. + + Args: + predictions (List): List of predictions for each sample. + references (List): List of reference answers for each sample. + + Returns: + dict: Preprocessed predictions and references in the required format. + """ + return { + 'predictions': predictions, + 'references': references, + } + + def _postprocess(self, scores: dict) -> dict: + """Postprocess the final score for F1. + + Args: + scores (dict): Dictionary of calculated F1 score. + + Returns: + dict: Postprocessed F1 score. + """ + return scores + + def score(self, predictions: List, references: List) -> dict: + """Calculate F1 score. + + Args: + predictions (List): List of predicted answers for each sample. + references (List): List of reference answers for each sample. 
+
+        Returns:
+            dict: Calculated F1 score.
+        """
+        random_state = random.getstate()
+        np_random_state = np.random.get_state()
+        details = []
+
+        random.seed(self.seed)
+        np.random.seed(self.seed)
+
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
+            }
+
+        true_positives = 0
+        false_positives = 0
+        false_negatives = 0
+
+        for hyp, ref in zip(predictions, references):
+            hyp = re.sub(r'[^A-Da-d,]+', '', hyp.lower())
+            ref = re.sub(r'[^A-Da-d,]+', '', ref.lower())
+            ref_set = set(ref.split(','))
+            hyp_set = set(hyp.split(','))
+            ref_set = {r.strip() for r in ref_set}
+            hyp_set = {h.strip() for h in hyp_set}
+
+            sample_tp = len(hyp_set.intersection(ref_set))
+            sample_fp = len(hyp_set - ref_set)
+            sample_fn = len(ref_set - hyp_set)
+            true_positives += sample_tp
+            false_positives += sample_fp
+            false_negatives += sample_fn
+            sample_precision = sample_tp / (sample_tp + sample_fp) if (sample_tp + sample_fp) > 0 else 0
+            sample_recall = sample_tp / (sample_tp + sample_fn) if (sample_tp + sample_fn) > 0 else 0
+            sample_f1 = (2 * sample_precision * sample_recall) / (sample_precision + sample_recall) if (sample_precision + sample_recall) > 0 else 0
+            details.append({'pred': hyp, 'answer': ref, 'correct': sample_f1 * 100})
+
+        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
+        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
+        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+
+        result = {
+            "ours_F1Score": f1 * 100,  # overall F1 score
+            "details": details
+        }
+        random.setstate(random_state)
+        np.random.set_state(np_random_state)
+        return self._postprocess(result)
+
+@ICL_EVALUATORS.register_module()
+class F1ScoreEvaluator(F1Evaluator):
+    """F1 score evaluator for multiple-select questions (registry wrapper around F1Evaluator)."""
+    def __init__(self) -> None:
+        super().__init__()
+
+
+# Custom multiple-select postprocessing: turns an answer such as 'ABC' into 'A, B, C'
+@TEXT_POSTPROCESSORS.register_module('my_multiple_select_postprocess')
+def my_multiple_select_postprocess(text: str) -> str:
+    selected_options = [t for t in text if t.isupper()]
+    selected_options = sorted(set(selected_options))
+    res = ', '.join(selected_options)
+    return res
+
+
+class AverageRougeEvaluator(BaseEvaluator):
+    """Average Rouge Score evaluator for fill-in-the-blank tasks.
+
+    Args:
+        seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0.
+    """
+
+    def __init__(self, seed: int = 0) -> None:
+        self.seed = seed
+        super().__init__()
+
+    def _preprocess(self, predictions: List, references: List) -> dict:
+        """Preprocess the final predictions and references to needed format.
+
+        Args:
+            predictions (List): List of predictions for each sample.
+            references (List): List of reference answers for each sample.
+
+        Returns:
+            dict: Preprocessed predictions and references in the required format.
+        """
+        pattern = r"(正确答案[::]|correct answer[::])"  # strip leading 'correct answer:' markers (zh/en)
+        cleaned_predictions = [re.sub(pattern, "", pred, flags=re.IGNORECASE).strip() for pred in predictions]
+
+        return {
+            'predictions': cleaned_predictions,
+            'references': references,
+        }
+
+    def _postprocess(self, scores: dict) -> dict:
+        """Postprocess the final Rouge scores.
+
+        Args:
+            scores (dict): Dictionary of calculated average Rouge scores.
+
+        Returns:
+            dict: Postprocessed Rouge scores.
+ """ + return scores + + def score(self, predictions: List, references: List) -> dict: + """Calculate average Rouge-L score. + + Args: + predictions (List): List of predicted strings for each sample. + references (List): List of reference strings for each sample. + + Returns: + dict: Calculated average Rouge-L score. + """ + def rouge_score(hyps, refs): + assert(len(hyps) == len(refs)) + hyps = [' '.join(jieba.cut(h)) for h in hyps] + hyps = [h if h.strip() != "" else "无内容" for h in hyps] + refs = [' '.join(jieba.cut(r)) for r in refs] + rouge_scores = Rouge().get_scores(hyps, refs) + rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores] + average_rouge_l = sum(rouge_ls) / len(rouge_ls) + return {"score": average_rouge_l * 100} + + random_state = random.getstate() + np_random_state = np.random.get_state() + details = [] + random.seed(self.seed) + np.random.seed(self.seed) + + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + f'length. len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' + } + + preprocessed_data = self._preprocess(predictions, references) + hyps, refs = preprocessed_data['predictions'], preprocessed_data['references'] + + scores = [] + for i in range(len(hyps)): + refs[i] = refs[i].replace(',', ',') + word_level_refs = refs[i].split(',') + word_level_refs = [r.strip() for r in word_level_refs] + if len(word_level_refs) == 1: + word_level_hyps = [hyps[i]] + else: + word_level_hyps = hyps[i].split(',') + word_level_hyps = [h.strip() for h in word_level_hyps] + + if len(word_level_hyps) < len(word_level_refs): + word_level_hyps += ['无内容'] * (len(word_level_refs) - len(word_level_hyps)) + else: + word_level_hyps = word_level_hyps[:len(word_level_refs)] + + sample_score = rouge_score(word_level_hyps, word_level_refs)["score"] + scores.append(sample_score) + details.append({'pred': word_level_hyps, 'answer': word_level_refs, 'correct': sample_score}) + + average_score = sum(scores) / len(scores) + result = { + "AvgRougeScore": average_score, + "details": details + } + random.setstate(random_state) + np.random.set_state(np_random_state) + + return self._postprocess(result) + + +@ICL_EVALUATORS.register_module() +class AverageRougeScoreEvaluator(AverageRougeEvaluator): + """Average Rouge Score evaluator.""" + + def __init__(self) -> None: + super().__init__() + + +class AccScoreStrEvaluator(BaseEvaluator): + """Accuracy evaluator based on string matching. + + Args: + seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0. + """ + + def __init__(self, seed: int = 0) -> None: + self.seed = seed + super().__init__() + + def _preprocess(self, predictions: List, references: List) -> dict: + """Preprocess the final predictions and references to needed format. + + Args: + predictions (List): List of predictions for each sample. + references (List): List of reference answers for each sample. + + Returns: + dict: Preprocessed predictions and references in the required format. + """ + return { + 'predictions': predictions, + 'references': references, + } + + def _postprocess(self, scores: dict) -> dict: + """Postprocess the final accuracy score. + + Args: + scores (dict): Dictionary of calculated accuracy score. + + Returns: + dict: Postprocessed accuracy score. + """ + return scores + + def score(self, predictions: List, references: List) -> dict: + """Calculate accuracy score. + + Args: + predictions (List): List of predicted strings for each sample. 
+            references (List): List of reference strings for each sample.
+
+        Returns:
+            dict: Calculated accuracy score.
+        """
+        random_state = random.getstate()
+        np_random_state = np.random.get_state()
+        details = []
+        random.seed(self.seed)
+        np.random.seed(self.seed)
+
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
+            }
+
+        preprocessed_data = self._preprocess(predictions, references)
+
+        correct = 0
+        for hyp, ref in zip(preprocessed_data['predictions'], preprocessed_data['references']):
+            is_correct = 1 if ref.strip().lower() in hyp.strip().lower() else 0
+            correct += is_correct
+            details.append({'pred': hyp, 'answer': ref, 'correct': is_correct})
+
+        accuracy = correct / len(predictions)
+        result = {
+            "ACCStrScore": accuracy * 100,
+            "details": details
+        }
+        random.setstate(random_state)
+        np.random.set_state(np_random_state)
+
+        return self._postprocess(result)
+
+
+@ICL_EVALUATORS.register_module()
+class AccScoreStr_Evaluator(AccScoreStrEvaluator):
+    """Registered accuracy evaluator wrapper around AccScoreStrEvaluator."""
+
+    def __init__(self) -> None:
+        super().__init__()
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 3e2d0eef..9fdf2411 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -123,6 +123,7 @@ from .ruler import *  # noqa: F401, F403
 from .safety import *  # noqa: F401, F403
 from .scibench import ScibenchDataset, scibench_postprocess  # noqa: F401, F403
 from .scicode import *  # noqa: F401, F403
+from .SeedBench import *  # noqa: F401, F403
 from .simpleqa import *  # noqa: F401, F403
 from .siqa import *  # noqa: F401, F403
 from .squad20 import SQuAD20Dataset, SQuAD20Evaluator  # noqa: F401, F403
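
A minimal way to exercise the postprocessor and evaluators registered by this patch outside a full OpenCompass run (a Python sketch, assuming OpenCompass with this patch applied is importable along with its jieba and rouge_chinese dependencies; the predictions and references below are made-up illustrations, not SeedBench data):

    from opencompass.datasets.SeedBench import (
        AccScoreStr_Evaluator,
        F1ScoreEvaluator,
        my_multiple_select_postprocess,
    )

    # Multiple-select postprocessing keeps unique uppercase option letters: 'ACD' -> 'A, C, D'.
    prediction = my_multiple_select_postprocess('ACD')

    # Set-based F1 over option letters; here tp=2, fp=1, fn=0, so F1 = 0.8, reported as 80.0.
    f1 = F1ScoreEvaluator().score(predictions=[prediction], references=['A, C'])
    print(f1['ours_F1Score'])

    # Substring-match accuracy: a sample counts as correct when the reference appears inside the prediction.
    acc = AccScoreStr_Evaluator().score(predictions=['The crop is rice.'], references=['rice'])
    print(acc['ACCStrScore'])  # 100.0

Both evaluators also return a per-sample 'details' list alongside the aggregate score, mirroring the result dictionaries built in SeedBench.py.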