refactor: delete unnecessary comment

chenzihong-gavin 2025-04-14 21:20:29 +08:00
parent f9b1636598
commit db04df78d4
3 changed files with 85 additions and 133 deletions

View File

@@ -53,7 +53,6 @@ dataset_configs = [
     {'abbr': 'seedbench_3-5', 'data_file': '3-5.json', 'evaluator': 'AccScoreStr_Evaluator'},
 ]
 seedbench_datasets = []
 for stage in ['zero-shot','one-shot']:
     for config in dataset_configs:

View File

@@ -1,27 +1,34 @@
-import os
 import random
-import datasets
-from typing import List
-from .base import BaseDataset
-from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
-import numpy as np
 import re
+from typing import List
+import datasets
 import jieba
+import numpy as np
 from rouge_chinese import Rouge
+from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, TEXT_POSTPROCESSORS
+from .base import BaseDataset
 
 
 class SeedBenchDataset(BaseDataset):
 
     @staticmethod
-    def load(data_files: str, path: str = 'json', split: str = None, **kwargs) -> datasets.Dataset:
+    def load(data_files: str,
+             path: str = 'json',
+             split: str = None,
+             **kwargs) -> datasets.Dataset:
         dataset = datasets.load_dataset(path, data_files=data_files, **kwargs)
         if split is None:
             split = list(dataset.keys())[0]
-        print(f"my datasets split : {split}")
+        print(f'my datasets split : {split}')
         if split not in dataset:
-            raise ValueError(f"Split '{split}' not found. Available splits: {list(dataset.keys())}")
+            raise ValueError(f"Split '{split}' not found. \
+                Available splits: {list(dataset.keys())}")
         return dataset[split]
@@ -30,7 +37,8 @@ class F1Evaluator(BaseEvaluator):
     """F1 Score evaluator for multiple choice questions.
 
     Args:
-        seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0.
+        seed (int): Seed for randomness, ensuring reproducibility.
+            Defaults to 0.
     """
 
     def __init__(self, seed: int = 0) -> None:
@@ -38,41 +46,15 @@ class F1Evaluator(BaseEvaluator):
         super().__init__()
 
     def _preprocess(self, predictions: List, references: List) -> dict:
-        """Preprocess the final predictions and references to needed format.
-
-        Args:
-            predictions (List): List of predictions for each sample.
-            references (List): List of reference answers for each sample.
-
-        Returns:
-            dict: Preprocessed predictions and references in the required format.
-        """
         return {
             'predictions': predictions,
             'references': references,
         }
 
     def _postprocess(self, scores: dict) -> dict:
-        """Postprocess the final score for F1.
-
-        Args:
-            scores (dict): Dictionary of calculated F1 score.
-
-        Returns:
-            dict: Postprocessed F1 score.
-        """
         return scores
 
     def score(self, predictions: List, references: List) -> dict:
-        """Calculate F1 score.
-
-        Args:
-            predictions (List): List of predicted answers for each sample.
-            references (List): List of reference answers for each sample.
-
-        Returns:
-            dict: Calculated F1 score.
-        """
         random_state = random.getstate()
         np_random_state = np.random.get_state()
         details = []
@@ -82,7 +64,8 @@ class F1Evaluator(BaseEvaluator):
 
         if len(predictions) != len(references):
             return {
-                'error': 'predictions and references have different '
+                'error':
+                'predictions and references have different '
                 f'length. len(predictions): {len(predictions)}, '
                 f'len(references): {len(references)}'
             }
@@ -105,18 +88,29 @@ class F1Evaluator(BaseEvaluator):
             true_positives += sample_tp
             false_positives += sample_fp
             false_negatives += sample_fn
-            sample_precision = sample_tp / (sample_tp + sample_fp) if (sample_tp + sample_fp) > 0 else 0
-            sample_recall = sample_tp / (sample_tp + sample_fn) if (sample_tp + sample_fn) > 0 else 0
-            sample_f1 = (2 * sample_precision * sample_recall) / (sample_precision + sample_recall) if (sample_precision + sample_recall) > 0 else 0
-            details.append({'pred': hyp, 'answer': ref, 'correct': sample_f1 * 100})
+            sample_precision = sample_tp / (sample_tp + sample_fp) if (
+                sample_tp + sample_fp) > 0 else 0
+            sample_recall = sample_tp / (sample_tp + sample_fn) if (
+                sample_tp + sample_fn) > 0 else 0
+            sample_f1 = (2 * sample_precision * sample_recall) / (
+                sample_precision + sample_recall) if (sample_precision +
+                                                      sample_recall) > 0 else 0
+            details.append({
+                'pred': hyp,
+                'answer': ref,
+                'correct': sample_f1 * 100
+            })
 
-        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
-        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
-        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+        precision = true_positives / (true_positives + false_positives) if (
+            true_positives + false_positives) > 0 else 0
+        recall = true_positives / (true_positives + false_negatives) if (
+            true_positives + false_negatives) > 0 else 0
+        f1 = (2 * precision *
+              recall) / (precision + recall) if (precision + recall) > 0 else 0
         result = {
-            "ours_F1Score": f1 * 100,  # 总体 F1 分数
-            "details": details
+            'ours_F1Score': f1 * 100,  # 总体 F1 分数
+            'details': details
         }
 
         random.setstate(random_state)
         np.random.set_state(np_random_state)
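
To make the per-sample arithmetic in the hunk above easier to follow, here is a minimal standalone sketch of the same precision/recall/F1 formulas with hypothetical counts (not taken from the dataset):

    # Illustrative sketch only: hypothetical per-sample counts.
    sample_tp, sample_fp, sample_fn = 2, 1, 1

    sample_precision = sample_tp / (sample_tp + sample_fp) if (
        sample_tp + sample_fp) > 0 else 0  # 2 / 3
    sample_recall = sample_tp / (sample_tp + sample_fn) if (
        sample_tp + sample_fn) > 0 else 0  # 2 / 3
    sample_f1 = (2 * sample_precision * sample_recall) / (
        sample_precision + sample_recall) if (sample_precision +
                                              sample_recall) > 0 else 0  # 2 / 3
    print(round(sample_f1 * 100, 1))  # 66.7, the value stored under 'correct'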
@@ -126,6 +120,7 @@ class F1Evaluator(BaseEvaluator):
 
 @ICL_EVALUATORS.register_module()
 class F1ScoreEvaluator(F1Evaluator):
     """F1 Score evaluator for multiple choice questions."""
+
     def __init__(self) -> None:
         super().__init__()
@@ -143,7 +138,8 @@ class AverageRougeEvaluator(BaseEvaluator):
     """Average Rouge Score evaluator for fill-in-the-blank tasks.
 
    Args:
-        seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0.
+        seed (int): Seed for randomness, ensuring reproducibility.
+            Defaults to 0.
     """
 
     def __init__(self, seed: int = 0) -> None:
@@ -151,17 +147,11 @@ class AverageRougeEvaluator(BaseEvaluator):
         super().__init__()
 
     def _preprocess(self, predictions: List, references: List) -> dict:
-        """Preprocess the final predictions and references to needed format.
-
-        Args:
-            predictions (List): List of predictions for each sample.
-            references (List): List of reference answers for each sample.
-
-        Returns:
-            dict: Preprocessed predictions and references in the required format.
-        """
-        pattern = r"(正确答案[:]|correct answer[:])"
-        cleaned_predictions = [re.sub(pattern, "", pred, flags=re.IGNORECASE).strip() for pred in predictions]
+        pattern = r'(正确答案[:]|correct answer[:])'
+        cleaned_predictions = [
+            re.sub(pattern, '', pred, flags=re.IGNORECASE).strip()
+            for pred in predictions
+        ]
 
         return {
             'predictions': cleaned_predictions,
@@ -169,35 +159,19 @@ class AverageRougeEvaluator(BaseEvaluator):
         }
 
     def _postprocess(self, scores: dict) -> dict:
-        """Postprocess the final Rouge scores.
-
-        Args:
-            scores (dict): Dictionary of calculated average Rouge scores.
-
-        Returns:
-            dict: Postprocessed Rouge scores.
-        """
         return scores
 
     def score(self, predictions: List, references: List) -> dict:
-        """Calculate average Rouge-L score.
-
-        Args:
-            predictions (List): List of predicted strings for each sample.
-            references (List): List of reference strings for each sample.
-
-        Returns:
-            dict: Calculated average Rouge-L score.
-        """
+
         def rouge_score(hyps, refs):
             assert (len(hyps) == len(refs))
             hyps = [' '.join(jieba.cut(h)) for h in hyps]
-            hyps = [h if h.strip() != "" else "无内容" for h in hyps]
+            hyps = [h if h.strip() != '' else '无内容' for h in hyps]
             refs = [' '.join(jieba.cut(r)) for r in refs]
             rouge_scores = Rouge().get_scores(hyps, refs)
-            rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores]
+            rouge_ls = [score['rouge-l']['f'] for score in rouge_scores]
             average_rouge_l = sum(rouge_ls) / len(rouge_ls)
-            return {"score": average_rouge_l * 100}
+            return {'score': average_rouge_l * 100}
 
         random_state = random.getstate()
         np_random_state = np.random.get_state()
@@ -207,13 +181,15 @@ class AverageRougeEvaluator(BaseEvaluator):
 
         if len(predictions) != len(references):
             return {
-                'error': 'predictions and references have different '
+                'error':
+                'predictions and references have different '
                 f'length. len(predictions): {len(predictions)}, '
                 f'len(references): {len(references)}'
             }
 
         preprocessed_data = self._preprocess(predictions, references)
-        hyps, refs = preprocessed_data['predictions'], preprocessed_data['references']
+        hyps, refs = preprocessed_data['predictions'], preprocessed_data[
+            'references']
 
         scores = []
         for i in range(len(hyps)):
@@ -227,19 +203,22 @@ class AverageRougeEvaluator(BaseEvaluator):
             word_level_hyps = [h.strip() for h in word_level_hyps]
             if len(word_level_hyps) < len(word_level_refs):
-                word_level_hyps += ['无内容'] * (len(word_level_refs) - len(word_level_hyps))
+                word_level_hyps += ['无内容'] * (len(word_level_refs) -
+                                              len(word_level_hyps))
             else:
                 word_level_hyps = word_level_hyps[:len(word_level_refs)]
 
-            sample_score = rouge_score(word_level_hyps, word_level_refs)["score"]
+            sample_score = rouge_score(word_level_hyps,
+                                       word_level_refs)['score']
             scores.append(sample_score)
-            details.append({'pred': word_level_hyps, 'answer': word_level_refs, 'correct': sample_score})
+            details.append({
+                'pred': word_level_hyps,
+                'answer': word_level_refs,
+                'correct': sample_score
+            })
 
         average_score = sum(scores) / len(scores)
 
-        result = {
-            "AvgRougeScore": average_score,
-            "details": details
-        }
+        result = {'AvgRougeScore': average_score, 'details': details}
 
         random.setstate(random_state)
         np.random.set_state(np_random_state)
@@ -258,7 +237,8 @@ class AccScoreStrEvaluator(BaseEvaluator):
     """Accuracy evaluator based on string matching.
 
     Args:
-        seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0.
+        seed (int): Seed for randomness, ensuring reproducibility.
+            Defaults to 0.
     """
 
     def __init__(self, seed: int = 0) -> None:
@@ -266,41 +246,15 @@ class AccScoreStrEvaluator(BaseEvaluator):
         super().__init__()
 
     def _preprocess(self, predictions: List, references: List) -> dict:
-        """Preprocess the final predictions and references to needed format.
-
-        Args:
-            predictions (List): List of predictions for each sample.
-            references (List): List of reference answers for each sample.
-
-        Returns:
-            dict: Preprocessed predictions and references in the required format.
-        """
         return {
             'predictions': predictions,
             'references': references,
         }
 
     def _postprocess(self, scores: dict) -> dict:
-        """Postprocess the final accuracy score.
-
-        Args:
-            scores (dict): Dictionary of calculated accuracy score.
-
-        Returns:
-            dict: Postprocessed accuracy score.
-        """
         return scores
 
     def score(self, predictions: List, references: List) -> dict:
-        """Calculate accuracy score.
-
-        Args:
-            predictions (List): List of predicted strings for each sample.
-            references (List): List of reference strings for each sample.
-
-        Returns:
-            dict: Calculated accuracy score.
-        """
         random_state = random.getstate()
         np_random_state = np.random.get_state()
         details = []
@@ -309,7 +263,8 @@ class AccScoreStrEvaluator(BaseEvaluator):
 
         if len(predictions) != len(references):
             return {
-                'error': 'predictions and references have different '
+                'error':
+                'predictions and references have different '
                 f'length. len(predictions): {len(predictions)}, '
                 f'len(references): {len(references)}'
             }
@@ -317,16 +272,14 @@ class AccScoreStrEvaluator(BaseEvaluator):
         preprocessed_data = self._preprocess(predictions, references)
         correct = 0
-        for hyp, ref in zip(preprocessed_data['predictions'], preprocessed_data['references']):
+        for hyp, ref in zip(preprocessed_data['predictions'],
+                            preprocessed_data['references']):
             is_correct = 1 if ref.strip().lower() in hyp.strip().lower() else 0
             correct += is_correct
             details.append({'pred': hyp, 'answer': ref, 'correct': is_correct})
 
         accuracy = correct / len(predictions)
 
-        result = {
-            "ACCStrScore": accuracy * 100,
-            "details": details
-        }
+        result = {'ACCStrScore': accuracy * 100, 'details': details}
 
         random.setstate(random_state)
         np.random.set_state(np_random_state)
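
For context on how these evaluator classes are exercised, a minimal usage sketch (the example inputs and the opencompass.datasets.seedbench import path are assumptions for illustration; the score() interface is the one shown in the diff above):

    # Illustrative sketch only: calling the evaluator directly on made-up data.
    from opencompass.datasets.seedbench import AccScoreStrEvaluator  # assumed path

    evaluator = AccScoreStrEvaluator()
    result = evaluator.score(
        predictions=['The correct answer is B', 'Answer: rice'],  # hypothetical
        references=['B', 'wheat'],                                # hypothetical
    )
    print(result['ACCStrScore'])  # 50.0: one of the two references appears in its prediction
    print(result['details'])      # per-sample dicts with 'pred', 'answer', 'correct'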