From dae700e0b011af4629c3193040e0102bf6700497 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Mon, 14 Apr 2025 14:23:29 +0800 Subject: [PATCH 1/6] [Dataset] Add SeedBench Dataset --- dataset-index.yml | 5 + .../datasets/SeedBench/seedbench_gen.py | 5 + .../SeedBench/seedbench_gen_44868b.py | 77 ++++ opencompass/datasets/SeedBench.py | 340 ++++++++++++++++++ opencompass/datasets/__init__.py | 1 + 5 files changed, 428 insertions(+) create mode 100644 opencompass/configs/datasets/SeedBench/seedbench_gen.py create mode 100644 opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py create mode 100644 opencompass/datasets/SeedBench.py diff --git a/dataset-index.yml b/dataset-index.yml index f1581c21..e4dcca61 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -811,6 +811,11 @@ paper: https://arxiv.org/pdf/2407.13168 configpath: opencompass/configs/datasets/scicode/scicode_gen.py configpath_llmjudge: '' +- seedbench: + name: SeedBench + category: Knowledge + paper: '' + configpath: opencompass/configs/datasets/SeedBench/seedbench_gen.py - simpleqa: name: SimpleQA category: Knowledge diff --git a/opencompass/configs/datasets/SeedBench/seedbench_gen.py b/opencompass/configs/datasets/SeedBench/seedbench_gen.py new file mode 100644 index 00000000..0714869a --- /dev/null +++ b/opencompass/configs/datasets/SeedBench/seedbench_gen.py @@ -0,0 +1,5 @@ +from mmengine.config import read_base + +with read_base(): + # Default use LLM as a judge + from .seedbench_gen_44868b import seedbench_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py b/opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py new file mode 100644 index 00000000..ba053fc9 --- /dev/null +++ b/opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py @@ -0,0 +1,77 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator, JiebaRougeEvaluator, RougeEvaluator +from opencompass.datasets.SeedBench import SeedBenchDataset, F1ScoreEvaluator, my_multiple_select_postprocess, AverageRougeScoreEvaluator +from opencompass.utils.text_postprocessors import first_option_postprocess + + +agri_data_dir = './data/SeedBench' + +agri_reader_cfg = dict( + input_columns=['instruction', 'question'], + output_column='answer' + ) + +agri_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='{instruction}\n{question}\n' + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + + +default_dataset_cfg = { + 'type': SeedBenchDataset, + 'path': 'json', + 'reader_cfg': agri_reader_cfg, + 'infer_cfg': agri_infer_cfg, +} + +dataset_configs = [ + # 1-n + {'abbr': 'seedbench_1-1', 'data_file': '1-1.json', 'evaluator': 'AccEvaluator', + 'pred_postprocessor': dict(type=first_option_postprocess, options='ABCD')}, + {'abbr': 'seedbench_1-2', 'data_file': '1-2.json', 'evaluator': 'F1ScoreEvaluator', + 'pred_postprocessor': dict(type=my_multiple_select_postprocess)}, + # {'abbr': 'seedbench_1-3_em', 'data_file': '1-3.json', 'evaluator': 'ExactMatchScoreEvaluator'}, + {'abbr': 'seedbench_1-3', 'data_file': '1-3.json', 'evaluator': 'AverageRougeScoreEvaluator'}, + {'abbr': 'seedbench_1-4', 'data_file': '1-4.json', 'evaluator': 'RougeEvaluator'}, + # # 2-n + {'abbr': 'seedbench_2-1', 'data_file': '2-1.json', 'evaluator': 'RougeEvaluator'}, + {'abbr': 
'seedbench_2-2', 'data_file': '2-2.json', 'evaluator': 'RougeEvaluator'}, + # 3-n + {'abbr': 'seedbench_3-1', 'data_file': '3-1.json', 'evaluator': 'AccEvaluator', + 'pred_postprocessor': dict(type=first_option_postprocess, options='ABCD')}, + {'abbr': 'seedbench_3-2', 'data_file': '3-2.json', 'evaluator': 'F1ScoreEvaluator', + 'pred_postprocessor': dict(type=my_multiple_select_postprocess)}, + # {'abbr': 'seedbench_3-3_em', 'data_file': '3-3.json', 'evaluator': 'ExactMatchScoreEvaluator'}, + {'abbr': 'seedbench_3-3', 'data_file': '3-3.json', 'evaluator': 'AverageRougeScoreEvaluator'}, + {'abbr': 'seedbench_3-4', 'data_file': '3-4.json', 'evaluator': 'RougeEvaluator'}, + {'abbr': 'seedbench_3-5', 'data_file': '3-5.json', 'evaluator': 'AccScoreStr_Evaluator'}, +] + + +seedbench_datasets = [] +for stage in ['zero-shot','one-shot']: + for config in dataset_configs: + eval_cfg = dict( + evaluator=dict(type=config['evaluator']) + ) + if 'pred_postprocessor' in config: + eval_cfg['pred_postprocessor'] = config['pred_postprocessor'] + data_file = f"{agri_data_dir}/{stage}/{config['data_file']}" + abbr_name = f"{config['abbr']}_{stage}" + seedbench_datasets.append( + dict( + type=SeedBenchDataset, + abbr=abbr_name, + data_files=data_file, + path='json', + reader_cfg=agri_reader_cfg, + infer_cfg=agri_infer_cfg, + eval_cfg=eval_cfg + ) + ) diff --git a/opencompass/datasets/SeedBench.py b/opencompass/datasets/SeedBench.py new file mode 100644 index 00000000..fa8bd962 --- /dev/null +++ b/opencompass/datasets/SeedBench.py @@ -0,0 +1,340 @@ +import os +import random +import datasets +from typing import List +from .base import BaseDataset +from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator +import numpy as np +import re +import jieba +from rouge_chinese import Rouge +from opencompass.registry import ICL_EVALUATORS, TEXT_POSTPROCESSORS + + +class SeedBenchDataset(BaseDataset): + @staticmethod + def load(data_files: str, path: str = 'json', split: str = None, **kwargs) -> datasets.Dataset: + dataset = datasets.load_dataset(path, data_files=data_files, **kwargs) + + if split is None: + split = list(dataset.keys())[0] + print(f"my datasets split : {split}") + + if split not in dataset: + raise ValueError(f"Split '{split}' not found. Available splits: {list(dataset.keys())}") + + return dataset[split] + + +class F1Evaluator(BaseEvaluator): + """F1 Score evaluator for multiple choice questions. + + Args: + seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0. + """ + + def __init__(self, seed: int = 0) -> None: + self.seed = seed + super().__init__() + + def _preprocess(self, predictions: List, references: List) -> dict: + """Preprocess the final predictions and references to needed format. + + Args: + predictions (List): List of predictions for each sample. + references (List): List of reference answers for each sample. + + Returns: + dict: Preprocessed predictions and references in the required format. + """ + return { + 'predictions': predictions, + 'references': references, + } + + def _postprocess(self, scores: dict) -> dict: + """Postprocess the final score for F1. + + Args: + scores (dict): Dictionary of calculated F1 score. + + Returns: + dict: Postprocessed F1 score. + """ + return scores + + def score(self, predictions: List, references: List) -> dict: + """Calculate F1 score. + + Args: + predictions (List): List of predicted answers for each sample. + references (List): List of reference answers for each sample. 
+ + Returns: + dict: Calculated F1 score. + """ + random_state = random.getstate() + np_random_state = np.random.get_state() + details = [] + + random.seed(self.seed) + np.random.seed(self.seed) + + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + f'length. len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' + } + + true_positives = 0 + false_positives = 0 + false_negatives = 0 + + for hyp, ref in zip(predictions, references): + hyp = re.sub(r'[^A-Da-d,]+', '', hyp.lower()) + ref = re.sub(r'[^A-Da-d,]+', '', ref.lower()) + ref_set = set(ref.split(',')) + hyp_set = set(hyp.split(',')) + ref_set = {r.strip() for r in ref_set} + hyp_set = {h.strip() for h in hyp_set} + + sample_tp = len(hyp_set.intersection(ref_set)) + sample_fp = len(hyp_set - ref_set) + sample_fn = len(ref_set - hyp_set) + true_positives += sample_tp + false_positives += sample_fp + false_negatives += sample_fn + sample_precision = sample_tp / (sample_tp + sample_fp) if (sample_tp + sample_fp) > 0 else 0 + sample_recall = sample_tp / (sample_tp + sample_fn) if (sample_tp + sample_fn) > 0 else 0 + sample_f1 = (2 * sample_precision * sample_recall) / (sample_precision + sample_recall) if (sample_precision + sample_recall) > 0 else 0 + details.append({'pred': hyp, 'answer': ref, 'correct': sample_f1 * 100}) + + precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0 + recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0 + f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 + + result = { + "ours_F1Score": f1 * 100, # 总体 F1 分数 + "details": details + } + random.setstate(random_state) + np.random.set_state(np_random_state) + return self._postprocess(result) + +@ICL_EVALUATORS.register_module() +class F1ScoreEvaluator(F1Evaluator): + """F1 Score evaluator for multiple choice questions.""" + def __init__(self) -> None: + super().__init__() + + +# 定义自己的多选后处理逻辑(输入回答为:ABC ---> A,B,C) +@TEXT_POSTPROCESSORS.register_module('my_multiple_select_postprocess') +def my_multiple_select_postprocess(text: str) -> str: + selected_options = [t for t in text if t.isupper()] + selected_options = sorted(set(selected_options)) + res = ', '.join(selected_options) + return res + + +class AverageRougeEvaluator(BaseEvaluator): + """Average Rouge Score evaluator for fill-in-the-blank tasks. + + Args: + seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0. + """ + + def __init__(self, seed: int = 0) -> None: + self.seed = seed + super().__init__() + + def _preprocess(self, predictions: List, references: List) -> dict: + """Preprocess the final predictions and references to needed format. + + Args: + predictions (List): List of predictions for each sample. + references (List): List of reference answers for each sample. + + Returns: + dict: Preprocessed predictions and references in the required format. + """ + pattern = r"(正确答案[::]|correct answer[::])" + cleaned_predictions = [re.sub(pattern, "", pred, flags=re.IGNORECASE).strip() for pred in predictions] + + return { + 'predictions': cleaned_predictions, + 'references': references, + } + + def _postprocess(self, scores: dict) -> dict: + """Postprocess the final Rouge scores. + + Args: + scores (dict): Dictionary of calculated average Rouge scores. + + Returns: + dict: Postprocessed Rouge scores. 
+ """ + return scores + + def score(self, predictions: List, references: List) -> dict: + """Calculate average Rouge-L score. + + Args: + predictions (List): List of predicted strings for each sample. + references (List): List of reference strings for each sample. + + Returns: + dict: Calculated average Rouge-L score. + """ + def rouge_score(hyps, refs): + assert(len(hyps) == len(refs)) + hyps = [' '.join(jieba.cut(h)) for h in hyps] + hyps = [h if h.strip() != "" else "无内容" for h in hyps] + refs = [' '.join(jieba.cut(r)) for r in refs] + rouge_scores = Rouge().get_scores(hyps, refs) + rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores] + average_rouge_l = sum(rouge_ls) / len(rouge_ls) + return {"score": average_rouge_l * 100} + + random_state = random.getstate() + np_random_state = np.random.get_state() + details = [] + random.seed(self.seed) + np.random.seed(self.seed) + + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + f'length. len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' + } + + preprocessed_data = self._preprocess(predictions, references) + hyps, refs = preprocessed_data['predictions'], preprocessed_data['references'] + + scores = [] + for i in range(len(hyps)): + refs[i] = refs[i].replace(',', ',') + word_level_refs = refs[i].split(',') + word_level_refs = [r.strip() for r in word_level_refs] + if len(word_level_refs) == 1: + word_level_hyps = [hyps[i]] + else: + word_level_hyps = hyps[i].split(',') + word_level_hyps = [h.strip() for h in word_level_hyps] + + if len(word_level_hyps) < len(word_level_refs): + word_level_hyps += ['无内容'] * (len(word_level_refs) - len(word_level_hyps)) + else: + word_level_hyps = word_level_hyps[:len(word_level_refs)] + + sample_score = rouge_score(word_level_hyps, word_level_refs)["score"] + scores.append(sample_score) + details.append({'pred': word_level_hyps, 'answer': word_level_refs, 'correct': sample_score}) + + average_score = sum(scores) / len(scores) + result = { + "AvgRougeScore": average_score, + "details": details + } + random.setstate(random_state) + np.random.set_state(np_random_state) + + return self._postprocess(result) + + +@ICL_EVALUATORS.register_module() +class AverageRougeScoreEvaluator(AverageRougeEvaluator): + """Average Rouge Score evaluator.""" + + def __init__(self) -> None: + super().__init__() + + +class AccScoreStrEvaluator(BaseEvaluator): + """Accuracy evaluator based on string matching. + + Args: + seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0. + """ + + def __init__(self, seed: int = 0) -> None: + self.seed = seed + super().__init__() + + def _preprocess(self, predictions: List, references: List) -> dict: + """Preprocess the final predictions and references to needed format. + + Args: + predictions (List): List of predictions for each sample. + references (List): List of reference answers for each sample. + + Returns: + dict: Preprocessed predictions and references in the required format. + """ + return { + 'predictions': predictions, + 'references': references, + } + + def _postprocess(self, scores: dict) -> dict: + """Postprocess the final accuracy score. + + Args: + scores (dict): Dictionary of calculated accuracy score. + + Returns: + dict: Postprocessed accuracy score. + """ + return scores + + def score(self, predictions: List, references: List) -> dict: + """Calculate accuracy score. + + Args: + predictions (List): List of predicted strings for each sample. 
+ references (List): List of reference strings for each sample. + + Returns: + dict: Calculated accuracy score. + """ + random_state = random.getstate() + np_random_state = np.random.get_state() + details = [] + random.seed(self.seed) + np.random.seed(self.seed) + + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + f'length. len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' + } + + preprocessed_data = self._preprocess(predictions, references) + + correct = 0 + for hyp, ref in zip(preprocessed_data['predictions'], preprocessed_data['references']): + is_correct = 1 if ref.strip().lower() in hyp.strip().lower() else 0 + correct += is_correct + details.append({'pred': hyp, 'answer': ref, 'correct': is_correct}) + + accuracy = correct / len(predictions) + result = { + "ACCStrScore": accuracy * 100, + "details": details + } + random.setstate(random_state) + np.random.set_state(np_random_state) + + return self._postprocess(result) + + +@ICL_EVALUATORS.register_module() +class AccScoreStr_Evaluator(AccScoreStrEvaluator): + """Accuracy evaluator wrapper for the AccScoreEvaluator.""" + + def __init__(self) -> None: + super().__init__() diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 3e2d0eef..9fdf2411 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -123,6 +123,7 @@ from .ruler import * # noqa: F401, F403 from .safety import * # noqa: F401, F403 from .scibench import ScibenchDataset, scibench_postprocess # noqa: F401, F403 from .scicode import * # noqa: F401, F403 +from .SeedBench import * # noqa: F401, F403 from .simpleqa import * # noqa: F401, F403 from .siqa import * # noqa: F401, F403 from .squad20 import SQuAD20Dataset, SQuAD20Evaluator # noqa: F401, F403 From f9b1636598c9f81e502aef98abdb1554357e21ce Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Mon, 14 Apr 2025 19:51:01 +0800 Subject: [PATCH 2/6] docs: add README for SeedBench --- .../configs/datasets/SeedBench/README.md | 79 +++++++++++++++++++ opencompass/datasets/SeedBench.py | 1 + opencompass/utils/datasets_info.py | 6 ++ 3 files changed, 86 insertions(+) create mode 100644 opencompass/configs/datasets/SeedBench/README.md diff --git a/opencompass/configs/datasets/SeedBench/README.md b/opencompass/configs/datasets/SeedBench/README.md new file mode 100644 index 00000000..c78f852a --- /dev/null +++ b/opencompass/configs/datasets/SeedBench/README.md @@ -0,0 +1,79 @@ +## 🌾 About SeedBench + +**SeedBench** is the first multi-task benchmark designed to evaluate large language models (LLMs) in seed science, focusing on seed breeding. This repository includes the dataset, evaluation code, and documentation to support research in this domain. + +SeedBench assesses LLMs across three core seed breeding stages: +- **Gene Information Retrieval** +- **Gene Function and Regulation Analysis** +- **Variety Breeding with Agronomic Trait Optimization** + +Built with domain experts, SeedBench features **2,264 expert-validated questions** across 11 task types and 10 subcategories, initially targeting rice breeding. Future updates will include other crops like maize, soybean, and wheat. + +--- + +## 🔎 Dataset Details + +- **Corpus**: 308,727 publications cleaned to 1.1 billion tokens; 279 segments from 113 documents. +- **Questions**: 2,264 across 11 task types, bilingual (English/Chinese), expert-validated. +- **Focus**: Rice breeding as a representative case. 
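+
+The per-task data files are JSON, organised into `zero-shot/` and `one-shot/` folders (e.g. `zero-shot/1-1.json`); the reader config in this PR uses `instruction` and `question` as input columns and `answer` as the target. As a minimal sketch — assuming the data has been fetched to the local `./data/SeedBench` directory mapped in `datasets_info.py` — a single task file can be inspected with the same Hugging Face `datasets` loader that `SeedBenchDataset` wraps:
+
+```python
+from datasets import load_dataset
+
+# Path is an assumption based on the local mapping added in this PR
+# (DATASETS_MAPPING["opencompass/seedbench"]["local"] = "./data/SeedBench").
+data_file = './data/SeedBench/zero-shot/1-1.json'
+
+# SeedBenchDataset.load() essentially does this and returns the first split.
+ds = load_dataset('json', data_files=data_file)['train']
+
+sample = ds[0]
+print(sample['instruction'])  # task instruction shown to the model
+print(sample['question'])     # question body
+print(sample['answer'])       # gold answer used by the evaluators
+```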
+ + **Types and metrics:** + + +
+ + | Type ID | Question Type | Metric | Count | + |---------|----------------------------|----------|-------| + | **Q&A** | | | | + | QA-1 | Multiple Choice | Accuracy | 200 | + | QA-2 | Multiple Answer | Macro-F1 | 187 | + | QA-3 | Fill-in-the-Blank | ROUGE-L | 224 | + | QA-4 | Generation | ROUGE-L | 242 | + | **Summarization** | | | | + | SUM-1 | Simple Summarization | ROUGE-L | 225 | + | SUM-2 | Key Information Extraction | ROUGE-L | 225 | + | **Reading Comprehension** | | | | + | RC-1 | Multiple Choice | Accuracy | 113 | + | RC-2 | Multiple Answer | Macro-F1 | 108 | + | RC-3 | Fill-in-the-Blank | ROUGE-L | 221 | + | RC-4 | Generation | ROUGE-L | 240 | + | RC-5 | Subcategory Classification | Accuracy | 279 | + +
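+For the Multiple Answer rows above, the benchmark first normalises the model output into a comma-separated option list and then scores it with a set-based F1. The snippet below is a self-contained sketch of that logic, mirroring `my_multiple_select_postprocess` and the per-sample score computed by `F1ScoreEvaluator` in `opencompass/datasets/SeedBench.py`; the example strings are invented for illustration:
+
+```python
+import re
+
+
+def multiple_select_postprocess(text: str) -> str:
+    """Keep unique uppercase option letters and sort them: 'DAC' -> 'A, C, D'."""
+    return ', '.join(sorted(set(t for t in text if t.isupper())))
+
+
+def option_f1(pred: str, ref: str) -> float:
+    """Per-sample F1 over option letters, in the spirit of F1ScoreEvaluator."""
+
+    def to_set(s: str) -> set:
+        # Strip everything except option letters and commas, then split.
+        cleaned = re.sub(r'[^A-Da-d,]+', '', s.lower())
+        return {part for part in cleaned.split(',') if part}
+
+    pred_set, ref_set = to_set(pred), to_set(ref)
+    tp = len(pred_set & ref_set)
+    precision = tp / len(pred_set) if pred_set else 0.0
+    recall = tp / len(ref_set) if ref_set else 0.0
+    return 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
+
+
+print(multiple_select_postprocess('DAC'))           # A, C, D
+print(round(option_f1('A, C, D', 'A, B, C'), 2))    # 0.67
+```
+
+Single-choice items are reduced to their first option letter before accuracy is computed, and the fill-in-the-blank tasks use an averaged ROUGE-L over `jieba`-segmented text, as implemented by the evaluators added in this patch.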
+ + +--- + +## 📂 Dataset Links + +- [SeedBench on Github](https://github.com/open-sciencelab/SeedBench) +- [SeedBench on Hugging Face](https://huggingface.co/datasets/yj12869741/SeedBench) +- [SeedBench on ModelScope](https://www.modelscope.cn/datasets/y12869741/SeedBench/summary) + +--- + +## ☀️ Key Results + +We evaluated 26 LLMs, including proprietary, open-source, and domain-specific models. Highlights: + +### Performance by Question Type + +- **Top Performers**: DeepSeek-V3 (68.37), GPT-4 (67.88). + +### Performance by Task Types + +| Model | QA-1 | QA-2 | QA-3 | QA-4 | SUM-1 | SUM-2 | RC-1 | RC-2 | RC-3 | RC-4 | RC-5 | Avg | +|------------------|------|------|------|------|-------|-------|------|------|------|------|------|------| +| GPT-4 | 60.50| 73.87| 21.35| 36.07| 58.73 | 62.89 | 100.00| 96.44| 87.86| 62.29| 86.74| 67.88| +| DeepSeek-V3 | 72.50| 79.84| 29.29| 40.63| 48.06 | 54.67 | 100.00| 97.22| 87.89| 55.19| 86.74| 68.37| +| Qwen2-72B | 59.50| 75.98| 19.55| 31.62| 31.08 | 63.09 | 99.12 | 94.24| 72.20| 51.58| 89.96| 62.54| + +### Performance by Subcategory + +| Model | C1 | C2 | C3 | C4 | C5 | C6 | C7 | C8 | C9 | C10 | Avg | +|-------------------|------|------|------|------|------|------|------|------|------|------|------| +| GPT-4 | 59.59| 60.55| 76.32| 61.16| 56.34| 59.35| 63.67| 64.74| 60.65| 67.66| 62.06| +| DeepSeek-V3-671B | 56.03| 62.42| 74.81| 63.17| 55.23| 58.84| 68.23| 69.04| 66.46| 68.48| 63.30| +| Qwen2-72B | 51.16| 58.10| 74.07| 59.72| 51.58| 57.76| 58.85| 61.63| 56.69| 59.11| 57.62| + +- **Top Performers**: DeepSeek-V3-671B (63.30), GPT-4 (62.06). \ No newline at end of file diff --git a/opencompass/datasets/SeedBench.py b/opencompass/datasets/SeedBench.py index fa8bd962..f7c97b5d 100644 --- a/opencompass/datasets/SeedBench.py +++ b/opencompass/datasets/SeedBench.py @@ -121,6 +121,7 @@ class F1Evaluator(BaseEvaluator): random.setstate(random_state) np.random.set_state(np_random_state) return self._postprocess(result) + @ICL_EVALUATORS.register_module() class F1ScoreEvaluator(F1Evaluator): diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 00db25e8..a6059e40 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -229,6 +229,12 @@ DATASETS_MAPPING = { "hf_id": "opencompass/race", "local": "./data/race/", }, + # SeedBench + "opencompass/seedbench": { + "ms_id": "", + "hf_id": "y12869741/SeedBench", + "local": "./data/SeedBench", + }, # SIQA "opencompass/siqa": { "ms_id": "opencompass/siqa", From db04df78d4e0ab858e021f00ec47d8066178f7c6 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Mon, 14 Apr 2025 21:20:29 +0800 Subject: [PATCH 3/6] refactor: delete unnecessary comment --- .../configs/datasets/SeedBench/README.md | 2 +- ..._gen_44868b.py => seedbench_gen_5d5ea1.py} | 1 - opencompass/datasets/SeedBench.py | 215 +++++++----------- 3 files changed, 85 insertions(+), 133 deletions(-) rename opencompass/configs/datasets/SeedBench/{seedbench_gen_44868b.py => seedbench_gen_5d5ea1.py} (99%) diff --git a/opencompass/configs/datasets/SeedBench/README.md b/opencompass/configs/datasets/SeedBench/README.md index c78f852a..0963db24 100644 --- a/opencompass/configs/datasets/SeedBench/README.md +++ b/opencompass/configs/datasets/SeedBench/README.md @@ -76,4 +76,4 @@ We evaluated 26 LLMs, including proprietary, open-source, and domain-specific mo | DeepSeek-V3-671B | 56.03| 62.42| 74.81| 63.17| 55.23| 58.84| 68.23| 69.04| 66.46| 68.48| 63.30| | Qwen2-72B | 51.16| 58.10| 74.07| 59.72| 51.58| 57.76| 
58.85| 61.63| 56.69| 59.11| 57.62| -- **Top Performers**: DeepSeek-V3-671B (63.30), GPT-4 (62.06). \ No newline at end of file +- **Top Performers**: DeepSeek-V3-671B (63.30), GPT-4 (62.06). diff --git a/opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py b/opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py similarity index 99% rename from opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py rename to opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py index ba053fc9..07661901 100644 --- a/opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py +++ b/opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py @@ -53,7 +53,6 @@ dataset_configs = [ {'abbr': 'seedbench_3-5', 'data_file': '3-5.json', 'evaluator': 'AccScoreStr_Evaluator'}, ] - seedbench_datasets = [] for stage in ['zero-shot','one-shot']: for config in dataset_configs: diff --git a/opencompass/datasets/SeedBench.py b/opencompass/datasets/SeedBench.py index f7c97b5d..3e0d4902 100644 --- a/opencompass/datasets/SeedBench.py +++ b/opencompass/datasets/SeedBench.py @@ -1,27 +1,34 @@ -import os import random -import datasets -from typing import List -from .base import BaseDataset -from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator -import numpy as np import re +from typing import List + +import datasets import jieba +import numpy as np from rouge_chinese import Rouge + +from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, TEXT_POSTPROCESSORS +from .base import BaseDataset + class SeedBenchDataset(BaseDataset): + @staticmethod - def load(data_files: str, path: str = 'json', split: str = None, **kwargs) -> datasets.Dataset: + def load(data_files: str, + path: str = 'json', + split: str = None, + **kwargs) -> datasets.Dataset: dataset = datasets.load_dataset(path, data_files=data_files, **kwargs) if split is None: split = list(dataset.keys())[0] - print(f"my datasets split : {split}") + print(f'my datasets split : {split}') if split not in dataset: - raise ValueError(f"Split '{split}' not found. Available splits: {list(dataset.keys())}") + raise ValueError(f"Split '{split}' not found. \ + Available splits: {list(dataset.keys())}") return dataset[split] @@ -30,7 +37,8 @@ class F1Evaluator(BaseEvaluator): """F1 Score evaluator for multiple choice questions. Args: - seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0. + seed (int): Seed for randomness, ensuring reproducibility. + Defaults to 0. """ def __init__(self, seed: int = 0) -> None: @@ -38,53 +46,28 @@ class F1Evaluator(BaseEvaluator): super().__init__() def _preprocess(self, predictions: List, references: List) -> dict: - """Preprocess the final predictions and references to needed format. - - Args: - predictions (List): List of predictions for each sample. - references (List): List of reference answers for each sample. - - Returns: - dict: Preprocessed predictions and references in the required format. - """ return { 'predictions': predictions, 'references': references, } def _postprocess(self, scores: dict) -> dict: - """Postprocess the final score for F1. - - Args: - scores (dict): Dictionary of calculated F1 score. - - Returns: - dict: Postprocessed F1 score. - """ return scores def score(self, predictions: List, references: List) -> dict: - """Calculate F1 score. - - Args: - predictions (List): List of predicted answers for each sample. - references (List): List of reference answers for each sample. 
- - Returns: - dict: Calculated F1 score. - """ random_state = random.getstate() np_random_state = np.random.get_state() details = [] random.seed(self.seed) np.random.seed(self.seed) - + if len(predictions) != len(references): return { - 'error': 'predictions and references have different ' - f'length. len(predictions): {len(predictions)}, ' - f'len(references): {len(references)}' + 'error': + 'predictions and references have different ' + f'length. len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' } true_positives = 0 @@ -98,34 +81,46 @@ class F1Evaluator(BaseEvaluator): hyp_set = set(hyp.split(',')) ref_set = {r.strip() for r in ref_set} hyp_set = {h.strip() for h in hyp_set} - + sample_tp = len(hyp_set.intersection(ref_set)) sample_fp = len(hyp_set - ref_set) sample_fn = len(ref_set - hyp_set) true_positives += sample_tp false_positives += sample_fp false_negatives += sample_fn - sample_precision = sample_tp / (sample_tp + sample_fp) if (sample_tp + sample_fp) > 0 else 0 - sample_recall = sample_tp / (sample_tp + sample_fn) if (sample_tp + sample_fn) > 0 else 0 - sample_f1 = (2 * sample_precision * sample_recall) / (sample_precision + sample_recall) if (sample_precision + sample_recall) > 0 else 0 - details.append({'pred': hyp, 'answer': ref, 'correct': sample_f1 * 100}) + sample_precision = sample_tp / (sample_tp + sample_fp) if ( + sample_tp + sample_fp) > 0 else 0 + sample_recall = sample_tp / (sample_tp + sample_fn) if ( + sample_tp + sample_fn) > 0 else 0 + sample_f1 = (2 * sample_precision * sample_recall) / ( + sample_precision + sample_recall) if (sample_precision + + sample_recall) > 0 else 0 + details.append({ + 'pred': hyp, + 'answer': ref, + 'correct': sample_f1 * 100 + }) - precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0 - recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0 - f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 + precision = true_positives / (true_positives + false_positives) if ( + true_positives + false_positives) > 0 else 0 + recall = true_positives / (true_positives + false_negatives) if ( + true_positives + false_negatives) > 0 else 0 + f1 = (2 * precision * + recall) / (precision + recall) if (precision + recall) > 0 else 0 result = { - "ours_F1Score": f1 * 100, # 总体 F1 分数 - "details": details + 'ours_F1Score': f1 * 100, # 总体 F1 分数 + 'details': details } random.setstate(random_state) np.random.set_state(np_random_state) return self._postprocess(result) - + @ICL_EVALUATORS.register_module() class F1ScoreEvaluator(F1Evaluator): """F1 Score evaluator for multiple choice questions.""" + def __init__(self) -> None: super().__init__() @@ -143,7 +138,8 @@ class AverageRougeEvaluator(BaseEvaluator): """Average Rouge Score evaluator for fill-in-the-blank tasks. Args: - seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0. + seed (int): Seed for randomness, ensuring reproducibility. + Defaults to 0. """ def __init__(self, seed: int = 0) -> None: @@ -151,17 +147,11 @@ class AverageRougeEvaluator(BaseEvaluator): super().__init__() def _preprocess(self, predictions: List, references: List) -> dict: - """Preprocess the final predictions and references to needed format. - - Args: - predictions (List): List of predictions for each sample. - references (List): List of reference answers for each sample. 
- - Returns: - dict: Preprocessed predictions and references in the required format. - """ - pattern = r"(正确答案[::]|correct answer[::])" - cleaned_predictions = [re.sub(pattern, "", pred, flags=re.IGNORECASE).strip() for pred in predictions] + pattern = r'(正确答案[::]|correct answer[::])' + cleaned_predictions = [ + re.sub(pattern, '', pred, flags=re.IGNORECASE).strip() + for pred in predictions + ] return { 'predictions': cleaned_predictions, @@ -169,36 +159,20 @@ class AverageRougeEvaluator(BaseEvaluator): } def _postprocess(self, scores: dict) -> dict: - """Postprocess the final Rouge scores. - - Args: - scores (dict): Dictionary of calculated average Rouge scores. - - Returns: - dict: Postprocessed Rouge scores. - """ return scores def score(self, predictions: List, references: List) -> dict: - """Calculate average Rouge-L score. - Args: - predictions (List): List of predicted strings for each sample. - references (List): List of reference strings for each sample. - - Returns: - dict: Calculated average Rouge-L score. - """ def rouge_score(hyps, refs): - assert(len(hyps) == len(refs)) + assert (len(hyps) == len(refs)) hyps = [' '.join(jieba.cut(h)) for h in hyps] - hyps = [h if h.strip() != "" else "无内容" for h in hyps] + hyps = [h if h.strip() != '' else '无内容' for h in hyps] refs = [' '.join(jieba.cut(r)) for r in refs] rouge_scores = Rouge().get_scores(hyps, refs) - rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores] + rouge_ls = [score['rouge-l']['f'] for score in rouge_scores] average_rouge_l = sum(rouge_ls) / len(rouge_ls) - return {"score": average_rouge_l * 100} - + return {'score': average_rouge_l * 100} + random_state = random.getstate() np_random_state = np.random.get_state() details = [] @@ -207,13 +181,15 @@ class AverageRougeEvaluator(BaseEvaluator): if len(predictions) != len(references): return { - 'error': 'predictions and references have different ' - f'length. len(predictions): {len(predictions)}, ' - f'len(references): {len(references)}' + 'error': + 'predictions and references have different ' + f'length. len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' } preprocessed_data = self._preprocess(predictions, references) - hyps, refs = preprocessed_data['predictions'], preprocessed_data['references'] + hyps, refs = preprocessed_data['predictions'], preprocessed_data[ + 'references'] scores = [] for i in range(len(hyps)): @@ -227,19 +203,22 @@ class AverageRougeEvaluator(BaseEvaluator): word_level_hyps = [h.strip() for h in word_level_hyps] if len(word_level_hyps) < len(word_level_refs): - word_level_hyps += ['无内容'] * (len(word_level_refs) - len(word_level_hyps)) + word_level_hyps += ['无内容'] * (len(word_level_refs) - + len(word_level_hyps)) else: word_level_hyps = word_level_hyps[:len(word_level_refs)] - sample_score = rouge_score(word_level_hyps, word_level_refs)["score"] + sample_score = rouge_score(word_level_hyps, + word_level_refs)['score'] scores.append(sample_score) - details.append({'pred': word_level_hyps, 'answer': word_level_refs, 'correct': sample_score}) + details.append({ + 'pred': word_level_hyps, + 'answer': word_level_refs, + 'correct': sample_score + }) average_score = sum(scores) / len(scores) - result = { - "AvgRougeScore": average_score, - "details": details - } + result = {'AvgRougeScore': average_score, 'details': details} random.setstate(random_state) np.random.set_state(np_random_state) @@ -258,7 +237,8 @@ class AccScoreStrEvaluator(BaseEvaluator): """Accuracy evaluator based on string matching. 
Args: - seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0. + seed (int): Seed for randomness, ensuring reproducibility. + Defaults to 0. """ def __init__(self, seed: int = 0) -> None: @@ -266,67 +246,40 @@ class AccScoreStrEvaluator(BaseEvaluator): super().__init__() def _preprocess(self, predictions: List, references: List) -> dict: - """Preprocess the final predictions and references to needed format. - - Args: - predictions (List): List of predictions for each sample. - references (List): List of reference answers for each sample. - - Returns: - dict: Preprocessed predictions and references in the required format. - """ return { 'predictions': predictions, 'references': references, } def _postprocess(self, scores: dict) -> dict: - """Postprocess the final accuracy score. - - Args: - scores (dict): Dictionary of calculated accuracy score. - - Returns: - dict: Postprocessed accuracy score. - """ return scores def score(self, predictions: List, references: List) -> dict: - """Calculate accuracy score. - - Args: - predictions (List): List of predicted strings for each sample. - references (List): List of reference strings for each sample. - - Returns: - dict: Calculated accuracy score. - """ random_state = random.getstate() np_random_state = np.random.get_state() details = [] random.seed(self.seed) np.random.seed(self.seed) - + if len(predictions) != len(references): return { - 'error': 'predictions and references have different ' - f'length. len(predictions): {len(predictions)}, ' - f'len(references): {len(references)}' + 'error': + 'predictions and references have different ' + f'length. len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' } preprocessed_data = self._preprocess(predictions, references) correct = 0 - for hyp, ref in zip(preprocessed_data['predictions'], preprocessed_data['references']): + for hyp, ref in zip(preprocessed_data['predictions'], + preprocessed_data['references']): is_correct = 1 if ref.strip().lower() in hyp.strip().lower() else 0 correct += is_correct details.append({'pred': hyp, 'answer': ref, 'correct': is_correct}) accuracy = correct / len(predictions) - result = { - "ACCStrScore": accuracy * 100, - "details": details - } + result = {'ACCStrScore': accuracy * 100, 'details': details} random.setstate(random_state) np.random.set_state(np_random_state) From c9ea024c673c5e5541ec0038cfa54f9398c7337c Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Tue, 15 Apr 2025 03:15:00 +0800 Subject: [PATCH 4/6] fix: fix load function for SeedBenchDataset --- .../datasets/SeedBench/seedbench_gen.py | 2 +- .../SeedBench/seedbench_gen_5d5ea1.py | 6 ++--- opencompass/datasets/SeedBench.py | 23 +++++++++++++++---- opencompass/utils/datasets_info.py | 2 +- 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/opencompass/configs/datasets/SeedBench/seedbench_gen.py b/opencompass/configs/datasets/SeedBench/seedbench_gen.py index 0714869a..9b9f7ac6 100644 --- a/opencompass/configs/datasets/SeedBench/seedbench_gen.py +++ b/opencompass/configs/datasets/SeedBench/seedbench_gen.py @@ -2,4 +2,4 @@ from mmengine.config import read_base with read_base(): # Default use LLM as a judge - from .seedbench_gen_44868b import seedbench_datasets # noqa: F401, F403 + from .seedbench_gen_5d5ea1 import seedbench_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py b/opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py index 07661901..55babe53 100644 --- 
a/opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py +++ b/opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py @@ -6,8 +6,6 @@ from opencompass.datasets.SeedBench import SeedBenchDataset, F1ScoreEvaluator, m from opencompass.utils.text_postprocessors import first_option_postprocess -agri_data_dir = './data/SeedBench' - agri_reader_cfg = dict( input_columns=['instruction', 'question'], output_column='answer' @@ -61,14 +59,14 @@ for stage in ['zero-shot','one-shot']: ) if 'pred_postprocessor' in config: eval_cfg['pred_postprocessor'] = config['pred_postprocessor'] - data_file = f"{agri_data_dir}/{stage}/{config['data_file']}" + data_file = f"{stage}/{config['data_file']}" abbr_name = f"{config['abbr']}_{stage}" seedbench_datasets.append( dict( type=SeedBenchDataset, abbr=abbr_name, data_files=data_file, - path='json', + path='opencompass/seedbench', reader_cfg=agri_reader_cfg, infer_cfg=agri_infer_cfg, eval_cfg=eval_cfg diff --git a/opencompass/datasets/SeedBench.py b/opencompass/datasets/SeedBench.py index 3e0d4902..7b88ea05 100644 --- a/opencompass/datasets/SeedBench.py +++ b/opencompass/datasets/SeedBench.py @@ -1,5 +1,6 @@ import random import re +from os import environ from typing import List import datasets @@ -8,23 +9,37 @@ import numpy as np from rouge_chinese import Rouge from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator -from opencompass.registry import ICL_EVALUATORS, TEXT_POSTPROCESSORS +from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET, + TEXT_POSTPROCESSORS) +from opencompass.utils import get_data_path from .base import BaseDataset +@LOAD_DATASET.register_module() class SeedBenchDataset(BaseDataset): @staticmethod def load(data_files: str, - path: str = 'json', + path: str, split: str = None, **kwargs) -> datasets.Dataset: - dataset = datasets.load_dataset(path, data_files=data_files, **kwargs) + + path = get_data_path(path) + if environ.get('DATASET_SOURCE', None) == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(path, + subset_name='default', + split=split, + data_files=data_files, + **kwargs) + else: + dataset = datasets.load_dataset(path, + data_files=data_files, + **kwargs) if split is None: split = list(dataset.keys())[0] - print(f'my datasets split : {split}') if split not in dataset: raise ValueError(f"Split '{split}' not found. 
\ diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 0cdc8624..d8418655 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -231,7 +231,7 @@ DATASETS_MAPPING = { }, # SeedBench "opencompass/seedbench": { - "ms_id": "", + "ms_id": "y12869741/SeedBench", "hf_id": "y12869741/SeedBench", "local": "./data/SeedBench", }, From 39b34d64880366b6bf7e6a67a2ce38a0192a32a7 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Tue, 15 Apr 2025 14:26:27 +0800 Subject: [PATCH 5/6] fix: delete unnecessary code --- opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py | 2 +- opencompass/datasets/SeedBench.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py b/opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py index 55babe53..8132f4d6 100644 --- a/opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py +++ b/opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator, JiebaRougeEvaluator, RougeEvaluator -from opencompass.datasets.SeedBench import SeedBenchDataset, F1ScoreEvaluator, my_multiple_select_postprocess, AverageRougeScoreEvaluator +from opencompass.datasets import SeedBenchDataset, F1ScoreEvaluator, my_multiple_select_postprocess, AverageRougeScoreEvaluator from opencompass.utils.text_postprocessors import first_option_postprocess diff --git a/opencompass/datasets/SeedBench.py b/opencompass/datasets/SeedBench.py index 7b88ea05..2f4e0b33 100644 --- a/opencompass/datasets/SeedBench.py +++ b/opencompass/datasets/SeedBench.py @@ -26,7 +26,7 @@ class SeedBenchDataset(BaseDataset): **kwargs) -> datasets.Dataset: path = get_data_path(path) - if environ.get('DATASET_SOURCE', None) == 'ModelScope': + if environ.get('DATASET_SOURCE') == 'ModelScope': from modelscope import MsDataset dataset = MsDataset.load(path, subset_name='default', From e335b29e1224e0212ec1a80f4b41bf28bf79efd6 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Tue, 15 Apr 2025 15:34:06 +0800 Subject: [PATCH 6/6] fix: fix typo --- opencompass/datasets/SeedBench.py | 2 +- opencompass/utils/datasets_info.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/opencompass/datasets/SeedBench.py b/opencompass/datasets/SeedBench.py index 2f4e0b33..3ec1ec23 100644 --- a/opencompass/datasets/SeedBench.py +++ b/opencompass/datasets/SeedBench.py @@ -124,7 +124,7 @@ class F1Evaluator(BaseEvaluator): recall) / (precision + recall) if (precision + recall) > 0 else 0 result = { - 'ours_F1Score': f1 * 100, # 总体 F1 分数 + 'F1Score': f1 * 100, # 总体 F1 分数 'details': details } random.setstate(random_state) diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index d8418655..b1b2a262 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -232,7 +232,7 @@ DATASETS_MAPPING = { # SeedBench "opencompass/seedbench": { "ms_id": "y12869741/SeedBench", - "hf_id": "y12869741/SeedBench", + "hf_id": "yj12869741/SeedBench", "local": "./data/SeedBench", }, # SIQA