2025-04-14 14:23:29 +08:00
|
|
|
|
import random
|
|
|
|
|
import re
|
2025-04-15 03:15:00 +08:00
|
|
|
|
from os import environ
|
2025-04-14 21:20:29 +08:00
|
|
|
|
from typing import List
|
|
|
|
|
|
|
|
|
|
import datasets
|
2025-04-14 14:23:29 +08:00
|
|
|
|
import jieba
|
2025-04-14 21:20:29 +08:00
|
|
|
|
import numpy as np
|
2025-04-14 14:23:29 +08:00
|
|
|
|
from rouge_chinese import Rouge
|
2025-04-14 21:20:29 +08:00
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
|
2025-04-15 03:15:00 +08:00
|
|
|
|
from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
|
|
|
|
|
TEXT_POSTPROCESSORS)
|
|
|
|
|
from opencompass.utils import get_data_path
|
2025-04-14 14:23:29 +08:00
|
|
|
|
|
2025-04-14 21:20:29 +08:00
|
|
|
|
from .base import BaseDataset
|
|
|
|
|
|
2025-04-14 14:23:29 +08:00
|
|
|
|
|
2025-04-15 03:15:00 +08:00
|
|
|
|
@LOAD_DATASET.register_module()
class SeedBenchDataset(BaseDataset):
    """Dataset loader registered with ``LOAD_DATASET``."""

    @staticmethod
    def load(data_files: str,
             path: str,
             split: str = None,
             **kwargs) -> datasets.Dataset:
        """Load one split of the dataset found at ``path``.

        Args:
            data_files (str): Forwarded as ``data_files`` to the underlying
                loader.
            path (str): Dataset location; resolved through
                ``get_data_path`` before loading.
            split (str, optional): Name of the split to return. When
                ``None``, the first split reported by the loaded dataset
                is used.
            **kwargs: Extra keyword arguments forwarded to the loader.

        Returns:
            datasets.Dataset: The selected split.

        Raises:
            ValueError: If ``split`` is not one of the loaded splits.
        """
        path = get_data_path(path)
        if environ.get('DATASET_SOURCE', None) == 'ModelScope':
            # Imported lazily so modelscope is only required when it is
            # the selected dataset source.
            from modelscope import MsDataset

            # NOTE(review): `split` is handed to MsDataset.load here and is
            # used again below as a key into the result -- confirm that the
            # object returned for an explicit split still supports
            # `in` / `keys()` / `[split]`.
            dataset = MsDataset.load(path,
                                     subset_name='default',
                                     split=split,
                                     data_files=data_files,
                                     **kwargs)
        else:
            dataset = datasets.load_dataset(path,
                                            data_files=data_files,
                                            **kwargs)

        if split is None:
            split = list(dataset.keys())[0]

        if split not in dataset:
            # Implicit literal concatenation instead of a backslash
            # continuation inside the string: a continuation would embed the
            # next line's indentation in the message.
            raise ValueError(f"Split '{split}' not found. "
                             f'Available splits: {list(dataset.keys())}')

        return dataset[split]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class F1Evaluator(BaseEvaluator):
    """F1 Score evaluator for multiple choice questions.

    Every prediction and reference is lower-cased, stripped of all
    characters other than the option letters ``a``-``d`` and commas, and
    split on commas into a set of selected options. True/false positives
    and false negatives are accumulated over all samples to produce one
    overall F1 score; a per-sample F1 is reported in ``details``.

    Args:
        seed (int): Seed for randomness, ensuring reproducibility.
            Defaults to 0.
    """

    def __init__(self, seed: int = 0) -> None:
        self.seed = seed
        super().__init__()

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Return the inputs unchanged, packed into a dict."""
        return {
            'predictions': predictions,
            'references': references,
        }

    def _postprocess(self, scores: dict) -> dict:
        """Return the score dict unchanged."""
        return scores

    @staticmethod
    def _clean(text: str) -> str:
        """Lower-case ``text`` and keep only option letters and commas."""
        return re.sub(r'[^A-Da-d,]+', '', text.lower())

    @staticmethod
    def _f1(tp: int, fp: int, fn: int) -> float:
        """Compute F1 from raw counts; 0 whenever a denominator is zero."""
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        if (precision + recall) > 0:
            return (2 * precision * recall) / (precision + recall)
        return 0

    def score(self, predictions: List, references: List) -> dict:
        """Score predictions against references.

        Args:
            predictions (List): Predicted answers as strings of
                comma-separated option letters.
            references (List): Gold answers in the same format.

        Returns:
            dict: ``{'ours_F1Score': <overall F1 * 100>, 'details': [...]}``
            where each detail holds the cleaned prediction, the cleaned
            answer and the per-sample F1 * 100; or ``{'error': <message>}``
            when the two lists differ in length.
        """
        # Validate before touching the global RNGs so the early return
        # cannot leave them re-seeded.
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        random_state = random.getstate()
        np_random_state = np.random.get_state()
        random.seed(self.seed)
        np.random.seed(self.seed)
        try:
            details = []
            true_positives = 0
            false_positives = 0
            false_negatives = 0

            for hyp, ref in zip(predictions, references):
                hyp = self._clean(hyp)
                ref = self._clean(ref)
                # Drop empty tokens: ''.split(',') yields [''], and stray or
                # trailing commas yield '' too; none of those is an option
                # and must not be counted as a false positive/negative.
                hyp_set = {opt for opt in hyp.split(',') if opt}
                ref_set = {opt for opt in ref.split(',') if opt}

                sample_tp = len(hyp_set & ref_set)
                sample_fp = len(hyp_set - ref_set)
                sample_fn = len(ref_set - hyp_set)
                true_positives += sample_tp
                false_positives += sample_fp
                false_negatives += sample_fn

                sample_f1 = self._f1(sample_tp, sample_fp, sample_fn)
                details.append({
                    'pred': hyp,
                    'answer': ref,
                    'correct': sample_f1 * 100
                })

            f1 = self._f1(true_positives, false_positives, false_negatives)
            result = {
                'ours_F1Score': f1 * 100,  # overall F1 score
                'details': details
            }
        finally:
            # Restore the caller's RNG state even if scoring raised.
            random.setstate(random_state)
            np.random.set_state(np_random_state)
        return self._postprocess(result)
|
2025-04-14 19:51:01 +08:00
|
|
|
|
|
2025-04-14 21:20:29 +08:00
|
|
|
|
|
2025-04-14 14:23:29 +08:00
|
|
|
|
@ICL_EVALUATORS.register_module()
class F1ScoreEvaluator(F1Evaluator):
    """F1 Score evaluator for multiple choice questions.

    Registers ``F1Evaluator`` with ``ICL_EVALUATORS``.

    Args:
        seed (int): Seed forwarded to ``F1Evaluator``. Defaults to 0.
    """

    def __init__(self, seed: int = 0) -> None:
        # Forward the seed so the parent's documented argument stays
        # reachable through the registered class.
        super().__init__(seed=seed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Custom multi-select post-processing (an answer such as "ABC" becomes
# "A, B, C").
@TEXT_POSTPROCESSORS.register_module('my_multiple_select_postprocess')
def my_multiple_select_postprocess(text: str) -> str:
    """Collect the distinct uppercase characters of ``text``.

    Args:
        text (str): Raw answer text.

    Returns:
        str: The unique uppercase characters found in ``text``, sorted and
        joined with ``', '``; an empty string when there are none.
    """
    unique_upper = {char for char in text if char.isupper()}
    return ', '.join(sorted(unique_upper))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AverageRougeEvaluator(BaseEvaluator):
    """Average Rouge Score evaluator for fill-in-the-blank tasks.

    A reference may hold several comma-separated answers (ASCII or
    full-width comma). The prediction is split the same way, aligned to the
    reference items by position, and the ROUGE-L F-score of each pair is
    averaged per sample, then across samples.

    Args:
        seed (int): Seed for randomness, ensuring reproducibility.
            Defaults to 0.
    """

    # Stand-in text ("no content") used where a prediction item is missing
    # or blank.
    _EMPTY_PLACEHOLDER = '无内容'

    def __init__(self, seed: int = 0) -> None:
        self.seed = seed
        super().__init__()

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Strip "correct answer:" style lead-ins from every prediction."""
        pattern = r'(正确答案[::]|correct answer[::])'
        cleaned_predictions = [
            re.sub(pattern, '', pred, flags=re.IGNORECASE).strip()
            for pred in predictions
        ]

        return {
            'predictions': cleaned_predictions,
            'references': references,
        }

    def _postprocess(self, scores: dict) -> dict:
        """Return the score dict unchanged."""
        return scores

    @staticmethod
    def _split_items(text: str) -> List[str]:
        """Split on ASCII or full-width commas and strip every item."""
        return [item.strip() for item in text.replace(',', ',').split(',')]

    def _average_rouge_l(self, hyps: List[str], refs: List[str]) -> float:
        """Return the mean ROUGE-L F-score (times 100) of aligned pairs.

        Both lists are segmented with ``jieba.cut`` and re-joined with
        spaces before scoring. Hypotheses that end up blank are replaced
        by the placeholder text before being handed to ``Rouge``.
        """
        # NOTE(review): a blank reference item is passed through unchanged
        # -- confirm how rouge_chinese treats an empty reference.
        hyps = [' '.join(jieba.cut(h)) for h in hyps]
        hyps = [
            h if h.strip() != '' else self._EMPTY_PLACEHOLDER for h in hyps
        ]
        refs = [' '.join(jieba.cut(r)) for r in refs]
        rouge_scores = Rouge().get_scores(hyps, refs)
        rouge_ls = [item['rouge-l']['f'] for item in rouge_scores]
        return sum(rouge_ls) / len(rouge_ls) * 100

    def score(self, predictions: List, references: List) -> dict:
        """Score predictions against references.

        Args:
            predictions (List): Predicted answer strings.
            references (List): Gold answer strings, possibly holding
                several comma-separated answers.

        Returns:
            dict: ``{'AvgRougeScore': <mean score>, 'details': [...]}``
            (score is 0.0 for empty input), or ``{'error': <message>}``
            when the two lists differ in length.
        """
        # Validate before touching the global RNGs so the early return
        # cannot leave them re-seeded.
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        random_state = random.getstate()
        np_random_state = np.random.get_state()
        random.seed(self.seed)
        np.random.seed(self.seed)
        try:
            preprocessed_data = self._preprocess(predictions, references)
            hyps = preprocessed_data['predictions']
            refs = preprocessed_data['references']

            details = []
            scores = []
            # Work on local values only; the caller's `references` list is
            # left untouched.
            for hyp, ref in zip(hyps, refs):
                word_level_refs = self._split_items(ref)
                if len(word_level_refs) == 1:
                    # Single answer: compare the whole prediction as-is.
                    word_level_hyps = [hyp]
                else:
                    # Same comma normalisation as the reference, so a
                    # prediction written with full-width commas is split
                    # into items as well.
                    word_level_hyps = self._split_items(hyp)

                # Align by position: pad missing items, drop surplus ones.
                missing = len(word_level_refs) - len(word_level_hyps)
                if missing > 0:
                    word_level_hyps += [self._EMPTY_PLACEHOLDER] * missing
                else:
                    word_level_hyps = word_level_hyps[:len(word_level_refs)]

                sample_score = self._average_rouge_l(word_level_hyps,
                                                     word_level_refs)
                scores.append(sample_score)
                details.append({
                    'pred': word_level_hyps,
                    'answer': word_level_refs,
                    'correct': sample_score
                })

            # Empty input yields 0.0 instead of a ZeroDivisionError.
            average_score = sum(scores) / len(scores) if scores else 0.0
            result = {'AvgRougeScore': average_score, 'details': details}
        finally:
            # Restore the caller's RNG state even if scoring raised.
            random.setstate(random_state)
            np.random.set_state(np_random_state)

        return self._postprocess(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ICL_EVALUATORS.register_module()
class AverageRougeScoreEvaluator(AverageRougeEvaluator):
    """Average Rouge Score evaluator.

    Registers ``AverageRougeEvaluator`` with ``ICL_EVALUATORS``.

    Args:
        seed (int): Seed forwarded to ``AverageRougeEvaluator``.
            Defaults to 0.
    """

    def __init__(self, seed: int = 0) -> None:
        # Forward the seed so the parent's documented argument stays
        # reachable through the registered class.
        super().__init__(seed=seed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AccScoreStrEvaluator(BaseEvaluator):
    """Accuracy evaluator based on string matching.

    A prediction counts as correct when the stripped, lower-cased reference
    occurs as a substring of the stripped, lower-cased prediction.

    Args:
        seed (int): Seed for randomness, ensuring reproducibility.
            Defaults to 0.
    """

    def __init__(self, seed: int = 0) -> None:
        self.seed = seed
        super().__init__()

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Return the inputs unchanged, packed into a dict."""
        return {
            'predictions': predictions,
            'references': references,
        }

    def _postprocess(self, scores: dict) -> dict:
        """Return the score dict unchanged."""
        return scores

    def score(self, predictions: List, references: List) -> dict:
        """Score predictions against references.

        Args:
            predictions (List): Predicted answer strings.
            references (List): Gold answer strings.

        Returns:
            dict: ``{'ACCStrScore': <accuracy * 100>, 'details': [...]}``
            (score is 0.0 for empty input), or ``{'error': <message>}``
            when the two lists differ in length.
        """
        # Validate before touching the global RNGs so the early return
        # cannot leave them re-seeded.
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        random_state = random.getstate()
        np_random_state = np.random.get_state()
        random.seed(self.seed)
        np.random.seed(self.seed)
        try:
            preprocessed_data = self._preprocess(predictions, references)

            details = []
            correct = 0
            for hyp, ref in zip(preprocessed_data['predictions'],
                                preprocessed_data['references']):
                is_correct = (1 if ref.strip().lower() in hyp.strip().lower()
                              else 0)
                correct += is_correct
                details.append({
                    'pred': hyp,
                    'answer': ref,
                    'correct': is_correct
                })

            # Empty input yields 0.0 instead of a ZeroDivisionError.
            accuracy = correct / len(predictions) if predictions else 0.0
            result = {'ACCStrScore': accuracy * 100, 'details': details}
        finally:
            # Restore the caller's RNG state even if scoring raised.
            random.setstate(random_state)
            np.random.set_state(np_random_state)

        return self._postprocess(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ICL_EVALUATORS.register_module()
class AccScoreStr_Evaluator(AccScoreStrEvaluator):
    """Accuracy evaluator wrapper for the AccScoreStrEvaluator.

    Registers ``AccScoreStrEvaluator`` with ``ICL_EVALUATORS``.

    Args:
        seed (int): Seed forwarded to ``AccScoreStrEvaluator``.
            Defaults to 0.
    """

    def __init__(self, seed: int = 0) -> None:
        # Forward the seed so the parent's documented argument stays
        # reachable through the registered class.
        super().__init__(seed=seed)
|