From db04df78d4e0ab858e021f00ec47d8066178f7c6 Mon Sep 17 00:00:00 2001
From: chenzihong-gavin
Date: Mon, 14 Apr 2025 21:20:29 +0800
Subject: [PATCH] refactor: delete unnecessary comment

---
 .../configs/datasets/SeedBench/README.md      |   2 +-
 ..._gen_44868b.py => seedbench_gen_5d5ea1.py} |   1 -
 opencompass/datasets/SeedBench.py             | 215 +++++++-----------
 3 files changed, 85 insertions(+), 133 deletions(-)
 rename opencompass/configs/datasets/SeedBench/{seedbench_gen_44868b.py => seedbench_gen_5d5ea1.py} (99%)

diff --git a/opencompass/configs/datasets/SeedBench/README.md b/opencompass/configs/datasets/SeedBench/README.md
index c78f852a..0963db24 100644
--- a/opencompass/configs/datasets/SeedBench/README.md
+++ b/opencompass/configs/datasets/SeedBench/README.md
@@ -76,4 +76,4 @@ We evaluated 26 LLMs, including proprietary, open-source, and domain-specific mo
 | DeepSeek-V3-671B | 56.03| 62.42| 74.81| 63.17| 55.23| 58.84| 68.23| 69.04| 66.46| 68.48| 63.30|
 | Qwen2-72B | 51.16| 58.10| 74.07| 59.72| 51.58| 57.76| 58.85| 61.63| 56.69| 59.11| 57.62|
 
-- **Top Performers**: DeepSeek-V3-671B (63.30), GPT-4 (62.06).
\ No newline at end of file
+- **Top Performers**: DeepSeek-V3-671B (63.30), GPT-4 (62.06).
diff --git a/opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py b/opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py
similarity index 99%
rename from opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py
rename to opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py
index ba053fc9..07661901 100644
--- a/opencompass/configs/datasets/SeedBench/seedbench_gen_44868b.py
+++ b/opencompass/configs/datasets/SeedBench/seedbench_gen_5d5ea1.py
@@ -53,7 +53,6 @@ dataset_configs = [
     {'abbr': 'seedbench_3-5', 'data_file': '3-5.json', 'evaluator': 'AccScoreStr_Evaluator'},
 ]
 
-
 seedbench_datasets = []
 for stage in ['zero-shot','one-shot']:
     for config in dataset_configs:
diff --git a/opencompass/datasets/SeedBench.py b/opencompass/datasets/SeedBench.py
index f7c97b5d..3e0d4902 100644
--- a/opencompass/datasets/SeedBench.py
+++ b/opencompass/datasets/SeedBench.py
@@ -1,27 +1,34 @@
-import os
 import random
-import datasets
-from typing import List
-from .base import BaseDataset
-from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
-import numpy as np
 import re
+from typing import List
+
+import datasets
 import jieba
+import numpy as np
 from rouge_chinese import Rouge
+
+from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, TEXT_POSTPROCESSORS
 
+from .base import BaseDataset
+
 
 class SeedBenchDataset(BaseDataset):
+
     @staticmethod
-    def load(data_files: str, path: str = 'json', split: str = None, **kwargs) -> datasets.Dataset:
+    def load(data_files: str,
+             path: str = 'json',
+             split: str = None,
+             **kwargs) -> datasets.Dataset:
         dataset = datasets.load_dataset(path, data_files=data_files, **kwargs)
         if split is None:
             split = list(dataset.keys())[0]
-        print(f"my datasets split : {split}")
+        print(f'Using dataset split: {split}')
 
         if split not in dataset:
-            raise ValueError(f"Split '{split}' not found. Available splits: {list(dataset.keys())}")
+            raise ValueError(f"Split '{split}' not found. "
+                             f'Available splits: {list(dataset.keys())}')
 
         return dataset[split]
 
 
@@ -30,7 +37,8 @@ class F1Evaluator(BaseEvaluator):
     """F1 Score evaluator for multiple choice questions.
 
     Args:
-        seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0.
+        seed (int): Seed for randomness, ensuring reproducibility.
+            Defaults to 0.
     """
 
     def __init__(self, seed: int = 0) -> None:
@@ -38,53 +46,28 @@ class F1Evaluator(BaseEvaluator):
         super().__init__()
 
     def _preprocess(self, predictions: List, references: List) -> dict:
-        """Preprocess the final predictions and references to needed format.
-
-        Args:
-            predictions (List): List of predictions for each sample.
-            references (List): List of reference answers for each sample.
-
-        Returns:
-            dict: Preprocessed predictions and references in the required format.
-        """
         return {
             'predictions': predictions,
             'references': references,
         }
 
     def _postprocess(self, scores: dict) -> dict:
-        """Postprocess the final score for F1.
-
-        Args:
-            scores (dict): Dictionary of calculated F1 score.
-
-        Returns:
-            dict: Postprocessed F1 score.
-        """
         return scores
 
     def score(self, predictions: List, references: List) -> dict:
-        """Calculate F1 score.
-
-        Args:
-            predictions (List): List of predicted answers for each sample.
-            references (List): List of reference answers for each sample.
-
-        Returns:
-            dict: Calculated F1 score.
-        """
         random_state = random.getstate()
         np_random_state = np.random.get_state()
         details = []
         random.seed(self.seed)
         np.random.seed(self.seed)
-        
+
         if len(predictions) != len(references):
             return {
-                'error': 'predictions and references have different '
-                f'length. len(predictions): {len(predictions)}, '
-                f'len(references): {len(references)}'
+                'error':
+                'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
             }
 
         true_positives = 0
@@ -98,34 +81,46 @@ class F1Evaluator(BaseEvaluator):
             hyp_set = set(hyp.split(','))
             ref_set = {r.strip() for r in ref_set}
             hyp_set = {h.strip() for h in hyp_set}
-        
+
             sample_tp = len(hyp_set.intersection(ref_set))
             sample_fp = len(hyp_set - ref_set)
             sample_fn = len(ref_set - hyp_set)
             true_positives += sample_tp
             false_positives += sample_fp
             false_negatives += sample_fn
-            sample_precision = sample_tp / (sample_tp + sample_fp) if (sample_tp + sample_fp) > 0 else 0
-            sample_recall = sample_tp / (sample_tp + sample_fn) if (sample_tp + sample_fn) > 0 else 0
-            sample_f1 = (2 * sample_precision * sample_recall) / (sample_precision + sample_recall) if (sample_precision + sample_recall) > 0 else 0
-            details.append({'pred': hyp, 'answer': ref, 'correct': sample_f1 * 100})
+            sample_precision = sample_tp / (sample_tp + sample_fp) if (
+                sample_tp + sample_fp) > 0 else 0
+            sample_recall = sample_tp / (sample_tp + sample_fn) if (
+                sample_tp + sample_fn) > 0 else 0
+            sample_f1 = (2 * sample_precision * sample_recall) / (
+                sample_precision + sample_recall) if (sample_precision +
+                                                      sample_recall) > 0 else 0
+            details.append({
+                'pred': hyp,
+                'answer': ref,
+                'correct': sample_f1 * 100
+            })
 
-        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
-        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
-        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+        precision = true_positives / (true_positives + false_positives) if (
+            true_positives + false_positives) > 0 else 0
+        recall = true_positives / (true_positives + false_negatives) if (
+            true_positives + false_negatives) > 0 else 0
+        f1 = (2 * precision *
+              recall) / (precision + recall) if (precision + recall) > 0 else 0
 
         result = {
-            "ours_F1Score": f1 * 100, # overall F1 score
-            "details": details
+            'ours_F1Score': f1 * 100,  # overall F1 score
+            'details': details
         }
 
         random.setstate(random_state)
        np.random.set_state(np_random_state)
 
         return self._postprocess(result)
-    
+
 
 @ICL_EVALUATORS.register_module()
 class F1ScoreEvaluator(F1Evaluator):
     """F1 Score evaluator for multiple choice questions."""
+
     def __init__(self) -> None:
         super().__init__()
 
@@ -143,7 +138,8 @@ class AverageRougeEvaluator(BaseEvaluator):
     """Average Rouge Score evaluator for fill-in-the-blank tasks.
 
     Args:
-        seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0.
+        seed (int): Seed for randomness, ensuring reproducibility.
+            Defaults to 0.
     """
 
     def __init__(self, seed: int = 0) -> None:
@@ -151,17 +147,11 @@ class AverageRougeEvaluator(BaseEvaluator):
         super().__init__()
 
     def _preprocess(self, predictions: List, references: List) -> dict:
-        """Preprocess the final predictions and references to needed format.
-
-        Args:
-            predictions (List): List of predictions for each sample.
-            references (List): List of reference answers for each sample.
-
-        Returns:
-            dict: Preprocessed predictions and references in the required format.
-        """
-        pattern = r"(正确答案[::]|correct answer[::])"
-        cleaned_predictions = [re.sub(pattern, "", pred, flags=re.IGNORECASE).strip() for pred in predictions]
+        pattern = r'(正确答案[::]|correct answer[::])'
+        cleaned_predictions = [
+            re.sub(pattern, '', pred, flags=re.IGNORECASE).strip()
+            for pred in predictions
+        ]
 
         return {
             'predictions': cleaned_predictions,
@@ -169,36 +159,20 @@ class AverageRougeEvaluator(BaseEvaluator):
         }
 
     def _postprocess(self, scores: dict) -> dict:
-        """Postprocess the final Rouge scores.
-
-        Args:
-            scores (dict): Dictionary of calculated average Rouge scores.
-
-        Returns:
-            dict: Postprocessed Rouge scores.
-        """
         return scores
 
     def score(self, predictions: List, references: List) -> dict:
-        """Calculate average Rouge-L score.
-        Args:
-            predictions (List): List of predicted strings for each sample.
-            references (List): List of reference strings for each sample.
-
-        Returns:
-            dict: Calculated average Rouge-L score.
-        """
+
         def rouge_score(hyps, refs):
-            assert(len(hyps) == len(refs))
+            assert len(hyps) == len(refs)
             hyps = [' '.join(jieba.cut(h)) for h in hyps]
-            hyps = [h if h.strip() != "" else "无内容" for h in hyps]
+            hyps = [h if h.strip() != '' else '无内容' for h in hyps]
             refs = [' '.join(jieba.cut(r)) for r in refs]
             rouge_scores = Rouge().get_scores(hyps, refs)
-            rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores]
+            rouge_ls = [score['rouge-l']['f'] for score in rouge_scores]
             average_rouge_l = sum(rouge_ls) / len(rouge_ls)
-            return {"score": average_rouge_l * 100}
-        
+            return {'score': average_rouge_l * 100}
+
         random_state = random.getstate()
         np_random_state = np.random.get_state()
         details = []
         random.seed(self.seed)
         np.random.seed(self.seed)
 
         if len(predictions) != len(references):
             return {
-                'error': 'predictions and references have different '
-                f'length. len(predictions): {len(predictions)}, '
-                f'len(references): {len(references)}'
+                'error':
+                'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
             }
 
         preprocessed_data = self._preprocess(predictions, references)
-        hyps, refs = preprocessed_data['predictions'], preprocessed_data['references']
+        hyps, refs = preprocessed_data['predictions'], preprocessed_data[
+            'references']
 
         scores = []
         for i in range(len(hyps)):
@@ -227,19 +203,22 @@ class AverageRougeEvaluator(BaseEvaluator):
             word_level_hyps = [h.strip() for h in word_level_hyps]
 
             if len(word_level_hyps) < len(word_level_refs):
-                word_level_hyps += ['无内容'] * (len(word_level_refs) - len(word_level_hyps))
+                word_level_hyps += ['无内容'] * (len(word_level_refs) -
+                                              len(word_level_hyps))
             else:
                 word_level_hyps = word_level_hyps[:len(word_level_refs)]
 
-            sample_score = rouge_score(word_level_hyps, word_level_refs)["score"]
+            sample_score = rouge_score(word_level_hyps,
+                                       word_level_refs)['score']
             scores.append(sample_score)
-            details.append({'pred': word_level_hyps, 'answer': word_level_refs, 'correct': sample_score})
+            details.append({
+                'pred': word_level_hyps,
+                'answer': word_level_refs,
+                'correct': sample_score
+            })
 
         average_score = sum(scores) / len(scores)
-        result = {
-            "AvgRougeScore": average_score,
-            "details": details
-        }
+        result = {'AvgRougeScore': average_score, 'details': details}
 
         random.setstate(random_state)
         np.random.set_state(np_random_state)
@@ -258,7 +237,8 @@ class AccScoreStrEvaluator(BaseEvaluator):
     """Accuracy evaluator based on string matching.
 
     Args:
-        seed (int): Seed for randomness, ensuring reproducibility. Defaults to 0.
+        seed (int): Seed for randomness, ensuring reproducibility.
+            Defaults to 0.
     """
 
     def __init__(self, seed: int = 0) -> None:
@@ -266,67 +246,40 @@ class AccScoreStrEvaluator(BaseEvaluator):
         super().__init__()
 
     def _preprocess(self, predictions: List, references: List) -> dict:
-        """Preprocess the final predictions and references to needed format.
-
-        Args:
-            predictions (List): List of predictions for each sample.
-            references (List): List of reference answers for each sample.
-
-        Returns:
-            dict: Preprocessed predictions and references in the required format.
-        """
         return {
             'predictions': predictions,
             'references': references,
         }
 
     def _postprocess(self, scores: dict) -> dict:
-        """Postprocess the final accuracy score.
-
-        Args:
-            scores (dict): Dictionary of calculated accuracy score.
-
-        Returns:
-            dict: Postprocessed accuracy score.
-        """
         return scores
 
     def score(self, predictions: List, references: List) -> dict:
-        """Calculate accuracy score.
-
-        Args:
-            predictions (List): List of predicted strings for each sample.
-            references (List): List of reference strings for each sample.
-
-        Returns:
-            dict: Calculated accuracy score.
-        """
         random_state = random.getstate()
         np_random_state = np.random.get_state()
         details = []
         random.seed(self.seed)
         np.random.seed(self.seed)
-        
+
         if len(predictions) != len(references):
             return {
-                'error': 'predictions and references have different '
-                f'length. len(predictions): {len(predictions)}, '
-                f'len(references): {len(references)}'
+                'error':
+                'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
             }
 
         preprocessed_data = self._preprocess(predictions, references)
 
         correct = 0
-        for hyp, ref in zip(preprocessed_data['predictions'], preprocessed_data['references']):
+        for hyp, ref in zip(preprocessed_data['predictions'],
+                            preprocessed_data['references']):
             is_correct = 1 if ref.strip().lower() in hyp.strip().lower() else 0
             correct += is_correct
             details.append({'pred': hyp, 'answer': ref, 'correct': is_correct})
 
         accuracy = correct / len(predictions)
-        result = {
-            "ACCStrScore": accuracy * 100,
-            "details": details
-        }
+        result = {'ACCStrScore': accuracy * 100, 'details': details}
 
         random.setstate(random_state)
         np.random.set_state(np_random_state)
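
Reviewer note: a quick way to sanity-check the refactored evaluators is to run them on toy inputs. The sketch below is illustrative only; it assumes an environment where `opencompass.datasets.SeedBench` and its dependencies import cleanly, and that `AccScoreStrEvaluator.score` returns its result dict the same way the other evaluators in this file do:

```python
from opencompass.datasets.SeedBench import (AccScoreStrEvaluator,
                                            F1ScoreEvaluator)

# F1Evaluator splits each prediction/reference on commas into a set of
# choices, accumulates tp/fp/fn across samples, and micro-averages.
f1 = F1ScoreEvaluator().score(
    predictions=['A, B', 'C'],
    references=['A, C', 'C'],
)
print(f1['ours_F1Score'])  # ~66.67 for these toy inputs: tp=2, fp=1, fn=1

# AccScoreStrEvaluator marks a sample correct when the reference string
# occurs, case-insensitively, inside the prediction.
acc = AccScoreStrEvaluator().score(
    predictions=['The answer is B.', 'D'],
    references=['B', 'C'],
)
print(acc['ACCStrScore'])  # 50.0: only the first sample matches
```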