chenzihong 2025-05-12 12:58:47 +08:00 committed by GitHub
commit e693952b32
7 changed files with 479 additions and 0 deletions

View File

@@ -859,6 +859,11 @@
    paper: https://arxiv.org/pdf/2407.13168
    configpath: opencompass/configs/datasets/scicode/scicode_gen.py
    configpath_llmjudge: ''
- seedbench:
    name: SeedBench
    category: Knowledge
    paper: ''
    configpath: opencompass/configs/datasets/SeedBench/seedbench_gen.py
- simpleqa:
    name: SimpleQA
    category: Knowledge

View File

@@ -0,0 +1,79 @@
## 🌾 About SeedBench
**SeedBench** is the first multi-task benchmark designed to evaluate large language models (LLMs) in seed science, focusing on seed breeding. This repository includes the dataset, evaluation code, and documentation to support research in this domain.
SeedBench assesses LLMs across three core seed breeding stages:
- **Gene Information Retrieval**
- **Gene Function and Regulation Analysis**
- **Variety Breeding with Agronomic Trait Optimization**
Developed with domain experts, SeedBench features **2,264 expert-validated questions** across 11 task types and 10 subcategories, initially targeting rice breeding. Future releases will extend coverage to other crops such as maize, soybean, and wheat.
---
## 🔎 Dataset Details
- **Corpus**: 308,727 publications cleaned to 1.1 billion tokens; 279 segments from 113 documents.
- **Questions**: 2,264 across 11 task types, bilingual (English/Chinese), expert-validated.
- **Focus**: Rice breeding as a representative case.
**Types and metrics:**
<div align="center">
| Type ID | Question Type | Metric | Count |
|---------|----------------------------|----------|-------|
| **Q&A** | | | |
| QA-1 | Multiple Choice | Accuracy | 200 |
| QA-2 | Multiple Answer | Macro-F1 | 187 |
| QA-3 | Fill-in-the-Blank | ROUGE-L | 224 |
| QA-4 | Generation | ROUGE-L | 242 |
| **Summarization** | | | |
| SUM-1 | Simple Summarization | ROUGE-L | 225 |
| SUM-2 | Key Information Extraction | ROUGE-L | 225 |
| **Reading Comprehension** | | | |
| RC-1 | Multiple Choice | Accuracy | 113 |
| RC-2 | Multiple Answer | Macro-F1 | 108 |
| RC-3 | Fill-in-the-Blank | ROUGE-L | 221 |
| RC-4 | Generation | ROUGE-L | 240 |
| RC-5 | Subcategory Classification | Accuracy | 279 |
</div>
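For the multiple-answer tasks (QA-2 and RC-2), the score is a set-based F1 over the selected options. A minimal sketch of that matching (illustrative only; it mirrors the option-set comparison used by the evaluator shipped with this config):

```python
# Illustrative set-based F1 for a single multiple-answer item.
def option_f1(pred: str, ref: str) -> float:
    pred_set = {p.strip().upper() for p in pred.split(',') if p.strip()}
    ref_set = {r.strip().upper() for r in ref.split(',') if r.strip()}
    tp = len(pred_set & ref_set)
    if tp == 0:
        return 0.0
    precision, recall = tp / len(pred_set), tp / len(ref_set)
    return 2 * precision * recall / (precision + recall)

print(round(option_f1('A, C', 'A, B, C'), 3))  # 0.8
```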
---
## 📂 Dataset Links
- [SeedBench on Github](https://github.com/open-sciencelab/SeedBench)
- [SeedBench on Hugging Face](https://huggingface.co/datasets/yj12869741/SeedBench)
- [SeedBench on ModelScope](https://www.modelscope.cn/datasets/y12869741/SeedBench/summary)
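If the data has been downloaded locally (OpenCompass registers it under `./data/SeedBench`, with per-task files such as `zero-shot/1-1.json`), a quick inspection with the Hugging Face `datasets` library might look like the sketch below; the field names (`instruction`, `question`, `answer`) follow the reader config used for this benchmark.

```python
# Peek at one SeedBench task file; the path assumes the ./data/SeedBench
# layout that OpenCompass registers for 'opencompass/seedbench'.
from datasets import load_dataset

ds = load_dataset('json', data_files='./data/SeedBench/zero-shot/1-1.json')['train']
print(len(ds), ds.column_names)
print(ds[0]['instruction'])
print(ds[0]['question'], '->', ds[0]['answer'])
```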
---
## ☀️ Key Results
We evaluated 26 LLMs, including proprietary, open-source, and domain-specific models. Highlights:
### Performance by Task Type
- **Top Performers**: DeepSeek-V3 (68.37), GPT-4 (67.88).
| Model | QA-1 | QA-2 | QA-3 | QA-4 | SUM-1 | SUM-2 | RC-1 | RC-2 | RC-3 | RC-4 | RC-5 | Avg |
|------------------|------|------|------|------|-------|-------|------|------|------|------|------|------|
| GPT-4 | 60.50| 73.87| 21.35| 36.07| 58.73 | 62.89 | 100.00| 96.44| 87.86| 62.29| 86.74| 67.88|
| DeepSeek-V3 | 72.50| 79.84| 29.29| 40.63| 48.06 | 54.67 | 100.00| 97.22| 87.89| 55.19| 86.74| 68.37|
| Qwen2-72B | 59.50| 75.98| 19.55| 31.62| 31.08 | 63.09 | 99.12 | 94.24| 72.20| 51.58| 89.96| 62.54|
### Performance by Subcategory
| Model | C1 | C2 | C3 | C4 | C5 | C6 | C7 | C8 | C9 | C10 | Avg |
|-------------------|------|------|------|------|------|------|------|------|------|------|------|
| GPT-4 | 59.59| 60.55| 76.32| 61.16| 56.34| 59.35| 63.67| 64.74| 60.65| 67.66| 62.06|
| DeepSeek-V3-671B | 56.03| 62.42| 74.81| 63.17| 55.23| 58.84| 68.23| 69.04| 66.46| 68.48| 63.30|
| Qwen2-72B | 51.16| 58.10| 74.07| 59.72| 51.58| 57.76| 58.85| 61.63| 56.69| 59.11| 57.62|
- **Top Performers**: DeepSeek-V3-671B (63.30), GPT-4 (62.06).

View File

@@ -0,0 +1,5 @@
from mmengine.config import read_base

with read_base():
    # Default SeedBench configuration (rule-based evaluators)
    from .seedbench_gen_5d5ea1 import seedbench_datasets  # noqa: F401, F403
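For reference, a minimal OpenCompass entry config that pairs these datasets with a model could look like the sketch below. The model import is only a placeholder assumption; point it at whichever model config exists in your installation.

```python
# Hypothetical run config: evaluate one model on SeedBench.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.SeedBench.seedbench_gen import \
        seedbench_datasets
    # Placeholder model config; replace with one available in your install.
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
        models as internlm_models

datasets = seedbench_datasets
models = internlm_models
```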

View File

@@ -0,0 +1,74 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator, JiebaRougeEvaluator, RougeEvaluator
from opencompass.datasets import SeedBenchDataset, F1ScoreEvaluator, my_multiple_select_postprocess, AverageRougeScoreEvaluator
from opencompass.utils.text_postprocessors import first_option_postprocess
agri_reader_cfg = dict(
    input_columns=['instruction', 'question'],
    output_column='answer'
)

agri_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{instruction}\n{question}\n'
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

default_dataset_cfg = {
    'type': SeedBenchDataset,
    'path': 'json',
    'reader_cfg': agri_reader_cfg,
    'infer_cfg': agri_infer_cfg,
}

dataset_configs = [
    # 1-n
    {'abbr': 'seedbench_1-1', 'data_file': '1-1.json', 'evaluator': 'AccEvaluator',
     'pred_postprocessor': dict(type=first_option_postprocess, options='ABCD')},
    {'abbr': 'seedbench_1-2', 'data_file': '1-2.json', 'evaluator': 'F1ScoreEvaluator',
     'pred_postprocessor': dict(type=my_multiple_select_postprocess)},
    # {'abbr': 'seedbench_1-3_em', 'data_file': '1-3.json', 'evaluator': 'ExactMatchScoreEvaluator'},
    {'abbr': 'seedbench_1-3', 'data_file': '1-3.json', 'evaluator': 'AverageRougeScoreEvaluator'},
    {'abbr': 'seedbench_1-4', 'data_file': '1-4.json', 'evaluator': 'RougeEvaluator'},
    # 2-n
    {'abbr': 'seedbench_2-1', 'data_file': '2-1.json', 'evaluator': 'RougeEvaluator'},
    {'abbr': 'seedbench_2-2', 'data_file': '2-2.json', 'evaluator': 'RougeEvaluator'},
    # 3-n
    {'abbr': 'seedbench_3-1', 'data_file': '3-1.json', 'evaluator': 'AccEvaluator',
     'pred_postprocessor': dict(type=first_option_postprocess, options='ABCD')},
    {'abbr': 'seedbench_3-2', 'data_file': '3-2.json', 'evaluator': 'F1ScoreEvaluator',
     'pred_postprocessor': dict(type=my_multiple_select_postprocess)},
    # {'abbr': 'seedbench_3-3_em', 'data_file': '3-3.json', 'evaluator': 'ExactMatchScoreEvaluator'},
    {'abbr': 'seedbench_3-3', 'data_file': '3-3.json', 'evaluator': 'AverageRougeScoreEvaluator'},
    {'abbr': 'seedbench_3-4', 'data_file': '3-4.json', 'evaluator': 'RougeEvaluator'},
    {'abbr': 'seedbench_3-5', 'data_file': '3-5.json', 'evaluator': 'AccScoreStr_Evaluator'},
]

seedbench_datasets = []
for stage in ['zero-shot', 'one-shot']:
    for config in dataset_configs:
        eval_cfg = dict(
            evaluator=dict(type=config['evaluator'])
        )
        if 'pred_postprocessor' in config:
            eval_cfg['pred_postprocessor'] = config['pred_postprocessor']

        data_file = f"{stage}/{config['data_file']}"
        abbr_name = f"{config['abbr']}_{stage}"

        seedbench_datasets.append(
            dict(
                type=SeedBenchDataset,
                abbr=abbr_name,
                data_files=data_file,
                path='opencompass/seedbench',
                reader_cfg=agri_reader_cfg,
                infer_cfg=agri_infer_cfg,
                eval_cfg=eval_cfg
            )
        )
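The nested loop above expands to 22 dataset entries: the 11 task configs crossed with the two prompting stages, each `abbr` suffixed with its stage. A standalone illustration of the resulting names (not an import of this config):

```python
# Illustration of the entry names generated by the loop above.
tasks = ['1-1', '1-2', '1-3', '1-4', '2-1', '2-2',
         '3-1', '3-2', '3-3', '3-4', '3-5']
abbrs = [f'seedbench_{task}_{stage}'
         for stage in ['zero-shot', 'one-shot']
         for task in tasks]
print(len(abbrs))  # 22
print(abbrs[0])    # seedbench_1-1_zero-shot
```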

View File

@@ -0,0 +1,309 @@
import random
import re
from os import environ
from typing import List
import datasets
import jieba
import numpy as np
from rouge_chinese import Rouge
from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
                                  TEXT_POSTPROCESSORS)
from opencompass.utils import get_data_path
from .base import BaseDataset

@LOAD_DATASET.register_module()
class SeedBenchDataset(BaseDataset):

    @staticmethod
    def load(data_files: str,
             path: str,
             split: str = None,
             **kwargs) -> datasets.Dataset:
        path = get_data_path(path)
        if environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            dataset = MsDataset.load(path,
                                     subset_name='default',
                                     split=split,
                                     data_files=data_files,
                                     **kwargs)
        else:
            dataset = datasets.load_dataset(path,
                                            data_files=data_files,
                                            **kwargs)

        if split is None:
            split = list(dataset.keys())[0]
        if split not in dataset:
            raise ValueError(f"Split '{split}' not found. "
                             f'Available splits: {list(dataset.keys())}')
        return dataset[split]

class F1Evaluator(BaseEvaluator):
    """F1 score evaluator for multiple-answer questions.

    Args:
        seed (int): Seed for randomness, ensuring reproducibility.
            Defaults to 0.
    """

    def __init__(self, seed: int = 0) -> None:
        self.seed = seed
        super().__init__()

    def _preprocess(self, predictions: List, references: List) -> dict:
        return {
            'predictions': predictions,
            'references': references,
        }

    def _postprocess(self, scores: dict) -> dict:
        return scores

    def score(self, predictions: List, references: List) -> dict:
        random_state = random.getstate()
        np_random_state = np.random.get_state()
        details = []
        random.seed(self.seed)
        np.random.seed(self.seed)
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        true_positives = 0
        false_positives = 0
        false_negatives = 0
        for hyp, ref in zip(predictions, references):
            # Keep only option letters (A-D) and commas, then compare as sets.
            hyp = re.sub(r'[^A-Da-d,]+', '', hyp.lower())
            ref = re.sub(r'[^A-Da-d,]+', '', ref.lower())
            ref_set = set(ref.split(','))
            hyp_set = set(hyp.split(','))
            ref_set = {r.strip() for r in ref_set}
            hyp_set = {h.strip() for h in hyp_set}

            sample_tp = len(hyp_set.intersection(ref_set))
            sample_fp = len(hyp_set - ref_set)
            sample_fn = len(ref_set - hyp_set)
            true_positives += sample_tp
            false_positives += sample_fp
            false_negatives += sample_fn

            sample_precision = sample_tp / (sample_tp + sample_fp) if (
                sample_tp + sample_fp) > 0 else 0
            sample_recall = sample_tp / (sample_tp + sample_fn) if (
                sample_tp + sample_fn) > 0 else 0
            sample_f1 = (2 * sample_precision * sample_recall) / (
                sample_precision + sample_recall) if (
                    sample_precision + sample_recall) > 0 else 0
            details.append({
                'pred': hyp,
                'answer': ref,
                'correct': sample_f1 * 100
            })

        precision = true_positives / (true_positives + false_positives) if (
            true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (
            true_positives + false_negatives) > 0 else 0
        f1 = (2 * precision *
              recall) / (precision + recall) if (precision + recall) > 0 else 0
        result = {
            'F1Score': f1 * 100,  # overall F1 score
            'details': details
        }
        random.setstate(random_state)
        np.random.set_state(np_random_state)
        return self._postprocess(result)

@ICL_EVALUATORS.register_module()
class F1ScoreEvaluator(F1Evaluator):
    """F1 score evaluator for multiple-answer questions."""

    def __init__(self) -> None:
        super().__init__()


# Custom multi-select postprocessing: turn a raw answer such as 'ABC'
# into the comma-separated form 'A, B, C' expected by the F1 evaluator.
@TEXT_POSTPROCESSORS.register_module('my_multiple_select_postprocess')
def my_multiple_select_postprocess(text: str) -> str:
    selected_options = [t for t in text if t.isupper()]
    selected_options = sorted(set(selected_options))
    res = ', '.join(selected_options)
    return res
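# Example behaviour (hypothetical inputs), given the definitions above:
#   my_multiple_select_postprocess('answer: A and C')           -> 'A, C'
#   F1ScoreEvaluator().score(['A, C'], ['A, B, C'])['F1Score']  -> ~80.0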

class AverageRougeEvaluator(BaseEvaluator):
    """Average ROUGE-L score evaluator for fill-in-the-blank tasks.

    Args:
        seed (int): Seed for randomness, ensuring reproducibility.
            Defaults to 0.
    """

    def __init__(self, seed: int = 0) -> None:
        self.seed = seed
        super().__init__()

    def _preprocess(self, predictions: List, references: List) -> dict:
        # Strip leading 'correct answer:' prefixes (Chinese or English).
        pattern = r'(正确答案[:：]|correct answer[:：])'
        cleaned_predictions = [
            re.sub(pattern, '', pred, flags=re.IGNORECASE).strip()
            for pred in predictions
        ]
        return {
            'predictions': cleaned_predictions,
            'references': references,
        }

    def _postprocess(self, scores: dict) -> dict:
        return scores

    def score(self, predictions: List, references: List) -> dict:

        def rouge_score(hyps, refs):
            assert len(hyps) == len(refs)
            hyps = [' '.join(jieba.cut(h)) for h in hyps]
            hyps = [h if h.strip() != '' else '无内容' for h in hyps]
            refs = [' '.join(jieba.cut(r)) for r in refs]
            rouge_scores = Rouge().get_scores(hyps, refs)
            rouge_ls = [score['rouge-l']['f'] for score in rouge_scores]
            average_rouge_l = sum(rouge_ls) / len(rouge_ls)
            return {'score': average_rouge_l * 100}

        random_state = random.getstate()
        np_random_state = np.random.get_state()
        details = []
        random.seed(self.seed)
        np.random.seed(self.seed)
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        preprocessed_data = self._preprocess(predictions, references)
        hyps, refs = preprocessed_data['predictions'], preprocessed_data[
            'references']

        scores = []
        for i in range(len(hyps)):
            # Normalise full-width commas, split the reference into word-level
            # blanks, and align the prediction to the same number of slots.
            refs[i] = refs[i].replace('，', ',')
            word_level_refs = refs[i].split(',')
            word_level_refs = [r.strip() for r in word_level_refs]
            if len(word_level_refs) == 1:
                word_level_hyps = [hyps[i]]
            else:
                word_level_hyps = hyps[i].split(',')
                word_level_hyps = [h.strip() for h in word_level_hyps]
                if len(word_level_hyps) < len(word_level_refs):
                    word_level_hyps += ['无内容'] * (len(word_level_refs) -
                                                  len(word_level_hyps))
                else:
                    word_level_hyps = word_level_hyps[:len(word_level_refs)]

            sample_score = rouge_score(word_level_hyps,
                                       word_level_refs)['score']
            scores.append(sample_score)
            details.append({
                'pred': word_level_hyps,
                'answer': word_level_refs,
                'correct': sample_score
            })

        average_score = sum(scores) / len(scores)
        result = {'AvgRougeScore': average_score, 'details': details}
        random.setstate(random_state)
        np.random.set_state(np_random_state)
        return self._postprocess(result)


@ICL_EVALUATORS.register_module()
class AverageRougeScoreEvaluator(AverageRougeEvaluator):
    """Average ROUGE-L score evaluator."""

    def __init__(self) -> None:
        super().__init__()
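# Example behaviour (hypothetical): for a two-blank reference such as
# 'OsMADS1，OsMADS15', the full-width comma is normalised, the reference is
# split into two slots, the prediction is split/padded to the same length,
# and the returned 'AvgRougeScore' is the mean per-slot ROUGE-L (0-100 scale).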

class AccScoreStrEvaluator(BaseEvaluator):
    """Accuracy evaluator based on substring matching.

    Args:
        seed (int): Seed for randomness, ensuring reproducibility.
            Defaults to 0.
    """

    def __init__(self, seed: int = 0) -> None:
        self.seed = seed
        super().__init__()

    def _preprocess(self, predictions: List, references: List) -> dict:
        return {
            'predictions': predictions,
            'references': references,
        }

    def _postprocess(self, scores: dict) -> dict:
        return scores

    def score(self, predictions: List, references: List) -> dict:
        random_state = random.getstate()
        np_random_state = np.random.get_state()
        details = []
        random.seed(self.seed)
        np.random.seed(self.seed)
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        preprocessed_data = self._preprocess(predictions, references)
        correct = 0
        for hyp, ref in zip(preprocessed_data['predictions'],
                            preprocessed_data['references']):
            # Count a hit when the reference string appears in the prediction.
            is_correct = 1 if ref.strip().lower() in hyp.strip().lower() else 0
            correct += is_correct
            details.append({'pred': hyp, 'answer': ref, 'correct': is_correct})

        accuracy = correct / len(predictions)
        result = {'ACCStrScore': accuracy * 100, 'details': details}
        random.setstate(random_state)
        np.random.set_state(np_random_state)
        return self._postprocess(result)


@ICL_EVALUATORS.register_module()
class AccScoreStr_Evaluator(AccScoreStrEvaluator):
    """Registered wrapper around AccScoreStrEvaluator."""

    def __init__(self) -> None:
        super().__init__()

View File

@@ -134,6 +134,7 @@ from .ruler import * # noqa: F401, F403
from .safety import * # noqa: F401, F403
from .scibench import ScibenchDataset, scibench_postprocess # noqa: F401, F403
from .scicode import * # noqa: F401, F403
from .SeedBench import * # noqa: F401, F403
from .simpleqa import * # noqa: F401, F403
from .siqa import * # noqa: F401, F403
from .smolinstruct import * # noqa: F401, F403

View File

@@ -235,6 +235,12 @@ DATASETS_MAPPING = {
        "hf_id": "opencompass/race",
        "local": "./data/race/",
    },
    # SeedBench
    "opencompass/seedbench": {
        "ms_id": "y12869741/SeedBench",
        "hf_id": "yj12869741/SeedBench",
        "local": "./data/SeedBench",
    },
    # SIQA
    "opencompass/siqa": {
        "ms_id": "opencompass/siqa",