From 2349fcff2cdcf0ef4c2df7fbc2e3d198a75210ad Mon Sep 17 00:00:00 2001
From: jnanliu
Date: Mon, 24 Feb 2025 06:25:17 +0000
Subject: [PATCH] delete gpassk_evaluator and fix potential errors

---
 .../livemathbench/livemathbench_gen_9befbf.py |   5 +-
 .../livemathbench/livemathbench_greedy_gen.py |   2 +-
 .../livemathbench_greedy_gen_9befbf.py        |   5 +-
 .../datasets/livemathbench/livemathbench.py   |  71 ++------
 opencompass/openicl/icl_evaluator/__init__.py |   1 -
 .../icl_evaluator/icl_base_evaluator.py       |  25 ++-
 .../icl_evaluator/icl_gpassk_evaluator.py     | 163 ------------------
 opencompass/tasks/openicl_eval.py             |   6 +-
 8 files changed, 40 insertions(+), 238 deletions(-)
 delete mode 100644 opencompass/openicl/icl_evaluator/icl_gpassk_evaluator.py

diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py b/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py
index 27b4db56..413475a7 100644
--- a/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py
+++ b/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py
@@ -41,10 +41,7 @@ livemathbench_dataset = dict(
             url=[],
             use_extract_model=False,
             extract_url=[],
-            extract_model_name='',
-            k=[4, 8, 16],
-            repeat=3,
-            thresholds=[0.0, 0.25, 0.5, 0.75, 1.0]
+            extract_model_name=''
         )
     )
 )
diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py
index d311eeaf..c1d72d15 100644
--- a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py
+++ b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .livemathbench_greedy_gen_efb20d import livemathbench_datasets  # noqa: F401, F403
\ No newline at end of file
+    from .livemathbench_greedy_gen_9befbf import livemathbench_datasets  # noqa: F401, F403
\ No newline at end of file
diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py
index a93c1f47..8b85a4bb 100644
--- a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py
+++ b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py
@@ -41,10 +41,7 @@ livemathbench_dataset = dict(
             url=[],
             use_extract_model=False,
             extract_url=[],
-            extract_model_name='',
-            k=[1],
-            repeat=1,
-            thresholds=[0.0]
+            extract_model_name=''
         )
     )
 )
diff --git a/opencompass/datasets/livemathbench/livemathbench.py b/opencompass/datasets/livemathbench/livemathbench.py
index 208af7de..e28bb4bd 100644
--- a/opencompass/datasets/livemathbench/livemathbench.py
+++ b/opencompass/datasets/livemathbench/livemathbench.py
@@ -1,10 +1,9 @@
 import os
 import warnings
-from collections import OrderedDict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from functools import partial
 from itertools import product
-from typing import Any, Callable, Dict, List, Union
+from typing import Any, Callable, Dict, List
 
 import jsonlines
 import mmengine
@@ -13,7 +12,7 @@ from datasets import Dataset, load_dataset
 
 from opencompass.datasets.math import MATHAgentEvaluator, math_postprocess_v2
 from opencompass.models import OpenAISDK
-from opencompass.openicl.icl_evaluator import GPassKEvaluator
+from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.openicl.icl_inferencer.icl_base_inferencer import \
     dump_results_dict
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, MODELS
@@ -107,7 +106,7 @@ class LiveMathBenchDataset(BaseDataset):
 
 
 @ICL_EVALUATORS.register_module()
-class LiveMathBenchEvaluator(GPassKEvaluator):
+class LiveMathBenchEvaluator(BaseEvaluator):
     api_meta_template = dict(round=[
         dict(role='HUMAN', api_role='HUMAN'),
         dict(role='BOT', api_role='BOT', generate=True),
@@ -118,11 +117,8 @@ class LiveMathBenchEvaluator(GPassKEvaluator):
     def __init__(self,
                  url,
                  use_extract_model=False,
                  extract_url=[],
-                 extract_model_name='',
-                 k: Union[int, List[int]] = 16,
-                 repeat: int = 3,
-                 thresholds: List[float] = [0.0, 0.25, 0.5, 0.75, 1.0]):
-        super().__init__(k, repeat, thresholds)
+                 extract_model_name=''):
+        super().__init__()
 
         if isinstance(url, str):
             url = [url]
@@ -303,55 +299,18 @@ class LiveMathBenchEvaluator(GPassKEvaluator):
     def preprocess(self, predictions, references, test_set):
         return self.judge(predictions, references, test_set)
 
-    def group(self, predictions, labels, test_set):
-        example2replications = {}
-        for example, label, prediction in zip(test_set, labels, predictions):
-            example_abbr = f"{example['subdivision']}_{example['idx']}"
-            if example_abbr not in example2replications:
-                example2replications[example_abbr] = []
-            example.update({'prediction': prediction, 'label': label})
-            example2replications[example_abbr].append(example)
-        for _, replications in example2replications.items():
-            assert len(replications) == self.n, print(len(replications),
-                                                      self.n)
-        return example2replications
+    def score(self, predictions, references, test_set) -> Dict[str, Any]:
+        labels = self.preprocess(predictions, references, test_set)
+        results = {'accuracy': 100 * np.mean(labels), 'details': []}
 
-    def reduce(self, details) -> Dict[str, Any]:
-        """Aggregate the overall metrics.
+        for pred, ref, label in zip(predictions, references, labels):
+            results['details'].append({
+                'pred': pred,
+                'ref': ref,
+                'correct': label
+            })
 
-        Return:
-            A dict contains overall metrics, like:
-            {'details': details for each example, 'G-Pass@16': xxx}
-        """
-        g_passk_details = OrderedDict()
-        g_passk_details['details'] = details
-
-        all_dataset = set([detail['subdivision'] for detail in details])
-
-        for k in self.k:
-            for subdivision in sorted(list(all_dataset)):
-                for threshold in self.thresholds:
-                    g_passk_details[
-                        f'{subdivision}/G-Pass@{k}_{threshold}'] = \
-                        100. * np.mean(
-                        [
-                            detail[f'G-Pass@{k}_{threshold}']
-                            for detail in details
-                            if detail['subdivision'] == subdivision
-                        ])
-                g_passk_details[f'{subdivision}/mG-Pass@{k}'] = 100. * np.mean(
-                    [
-                        detail[f'mG-Pass@{k}'] for detail in details
-                        if detail['subdivision'] == subdivision
-                    ])
-
-            for threshold in self.thresholds:
-                g_passk_details[f'G-Pass@{k}_{threshold}'] = 100. * np.mean(
-                    [detail[f'G-Pass@{k}_{threshold}'] for detail in details])
-            g_passk_details[f'mG-Pass@{k}'] = 100. * np.mean(
-                [detail[f'mG-Pass@{k}'] for detail in details])
-
-        return g_passk_details
+        return results
 
 
 class LiveMathBenchOutputHandler:
diff --git a/opencompass/openicl/icl_evaluator/__init__.py b/opencompass/openicl/icl_evaluator/__init__.py
index 5103c00d..1fd1683b 100644
--- a/opencompass/openicl/icl_evaluator/__init__.py
+++ b/opencompass/openicl/icl_evaluator/__init__.py
@@ -4,7 +4,6 @@ from .icl_base_evaluator import BaseEvaluator  # noqa
 from .icl_bpc_evaluator import BPCEvaluator  # noqa
 from .icl_circular_evaluator import CircularEvaluator  # noqa
 from .icl_em_evaluator import EMEvaluator  # noqa
-from .icl_gpassk_evaluator import GPassKEvaluator  # noqa
 from .icl_hf_evaluator import *  # noqa
 from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator  # noqa
 from .icl_misc_evaluator import AverageInferencePPLEvaluator  # noqa
diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
index a637c38f..3286da79 100644
--- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
@@ -41,6 +41,11 @@ class BaseEvaluator:
     def __init__(self) -> None:
         pass
 
+    @property
+    def output_dir(self):
+        # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200
+        return self._out_dir
+
     def group(self, n: int, details: List[Dict[str, Any]],
               test_set: Dataset) -> Dict[str, Any]:
         example2replications = {}
@@ -77,18 +82,24 @@ class BaseEvaluator:
         return g_passk_details
 
     def evaluate(self, k: Union[int, List[int]], repeat: int,
-                 test_set: Dataset, **score_kwargs):
+                 original_dataset: Dataset, **score_kwargs):
         n = (max(k) if isinstance(k, List) else k) * repeat
-        print(len(score_kwargs['predictions']))
-        real_size = len(test_set) // n
+        real_size = len(original_dataset) // n
         all_details = []
         all_results = []
         for i in range(n):
+
+            def select_fn(i, real_size, x):
+                if isinstance(x, Dataset):
+                    return x.select(range(i * real_size, (i + 1) * real_size))
+                elif isinstance(x, Iterable):
+                    return x[i * real_size:(i + 1) * real_size]
+                else:
+                    return x
+
             results = self.score(
                 **{
-                    key:
-                    value[i * real_size:(i + 1) *
-                          real_size] if isinstance(value, Iterable) else value
+                    key: select_fn(i, real_size, value)
                     for key, value in score_kwargs.items()
                 })
             details = results.pop('details', None)
@@ -118,7 +129,7 @@ class BaseEvaluator:
             else:
                 eval_results[key] = eval_results[key][0]
 
-        grouped_examples = self.group(n, all_details, test_set)
+        grouped_examples = self.group(n, all_details, original_dataset)
         can_calculate = False
         if len(all_details) != 0:
             eval_details = []
diff --git a/opencompass/openicl/icl_evaluator/icl_gpassk_evaluator.py b/opencompass/openicl/icl_evaluator/icl_gpassk_evaluator.py
deleted file mode 100644
index 8391a435..00000000
--- a/opencompass/openicl/icl_evaluator/icl_gpassk_evaluator.py
+++ /dev/null
@@ -1,163 +0,0 @@
-from abc import abstractmethod
-from typing import Any, Dict, List, Union
-
-import numpy as np
-from scipy.stats import hypergeom
-
-from opencompass.registry import ICL_EVALUATORS
-
-from .icl_base_evaluator import BaseEvaluator
-
-
-def compute_pass_at_k(n, c, k):
-    if n - c < k:
-        return 1.0
-    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
-
-
-def _compute_g_pass_at_k(n, c, k, m):
-    if m > min(c, k) or k > n or c < 0 or n <= 0 or m < 0:
-        return 0.0
-    return hypergeom.sf(m - 1, n, c, k)
-
-
-def compute_g_pass_at_k(n, c, k, t):
-    m = max(int(np.ceil(k * t)), 1)
-    return _compute_g_pass_at_k(n, c, k, m)
-
-
-def compute_mg_pass_at_k(n, c, k):
-    l, r = int(np.ceil(k * 0.5)), k
-
-    mg_pass_at_k = 0.0
-    for i in range(l + 1, r + 1):
-        mg_pass_at_k += _compute_g_pass_at_k(n, c, k, i)
-    mg_pass_at_k = 2 * mg_pass_at_k / k
-
-    return mg_pass_at_k
-
-
-@ICL_EVALUATORS.register_module()
-class GPassKEvaluator(BaseEvaluator):
-    """Evaluator for computing the G-Pass@k Metric.
-
-    This evaluator performs the following steps:
-    1. Invokes task-specific `preprocess` on predictions to
-       assign a consistency label to each prediction and its
-       corresponding reference.
-    2. Calculates metrics for each input example based on
-       these labels.
-    3. Aggregates the overall metrics through a task-specific
-       `postprocess`.
-
-    Args:
-        k (int or list of int): Number of predictions to be
-            considered in G-Pass@k. It can be a single integer
-            (e.g., `k=16` computes G-Pass@16) or a list of
-            integers (e.g., `[4, 8, 16]` computes G-Pass@4,
-            G-Pass@8, and G-Pass@16).
-
-        repeat (int): Controls the number of generations
-            used to estimate G-Pass@k. The total number of
-            generations is determined by multiplying the
-            maximum of `k` with `repeat`. This parameter
-            should be a single integer.
-
-        thresholds (list of float): A list of floating-point
-            numbers that define the thresholds for the G-Pass@k
-            metric.
-    """
-
-    def __init__(
-            self,
-            k: Union[int, List[int]] = 16,
-            repeat: int = 3,
-            thresholds: List[float] = [0.0, 0.25, 0.5, 0.75, 1.0]) -> None:
-        super().__init__()
-
-        if isinstance(k, int):
-            k = [k]
-
-        self.k = k
-        self.repeat = repeat
-        self.n = max(k) * repeat
-        self.thresholds = thresholds
-
-    @property
-    def output_dir(self):
-        # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200
-        return self._out_dir
-
-    @abstractmethod
-    def preprocess(self, predictions, references, test_set) -> None:
-        """Perform operations on predictions before computing metrics, for
-        example, do answer_extraction and model_judge in mathematical reasoning
-        task.
-
-        Return:
-            labels: A list contains the label which indicates whether
-                prediction is consistency with reference at each position.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def group(self, predictions, labels, test_set) -> Dict[str, Any]:
-        """Group the predictions and references.
-
-        Return:
-            A dict contains the grouped predictions and references.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def reduce(self, details) -> Dict[str, Any]:
-        """Aggregate the overall metrics.
-
-        Return:
-            A dict contains overall metrics, like:
-            {'details': details for each example, 'G-Pass@16': xxx}
-        """
-        raise NotImplementedError
-
-    def score(self, predictions, references, test_set) -> Dict[str, Any]:
-        """Compute G-Pass@k metrics.
-
-        Return:
-            A dict contains metrics for each dataset sample and
-            overall metrics reduced by `self.reduce`, like:
-            {'details': details for each example, 'G-Pass@16': xxx}
-        """
-        labels = self.preprocess(predictions, references, test_set)
-        grouped_examples = self.group(predictions, labels, test_set)
-
-        details = []
-        total_pass_num, count = 0, 0
-        for example_abbr, examples in grouped_examples.items():
-            detail = {
-                k: v
-                for k, v in examples[0].items()
-                if k not in ['prediction', 'label']
-            }
-            detail.update({
-                'predictions': [{
-                    'prediction': example['prediction'],
-                    'label': example['label']
-                } for example in examples],
-            })
-
-            current_example_labels = [e['label'] for e in examples]
-            c = int(np.sum(current_example_labels))
-
-            for k in self.k:
-                for threshold in self.thresholds:
-                    detail[f'G-Pass@{k}_{threshold}'] = compute_g_pass_at_k(
-                        n=self.n, c=c, k=k, t=threshold)
-                detail[f'mG-Pass@{k}'] = compute_mg_pass_at_k(n=self.n,
-                                                              c=c,
-                                                              k=k)
-            count += self.n
-            total_pass_num += c
-
-            details.append(detail)
-
-        return self.reduce(details)
diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py
index 960e3a0a..6841225e 100644
--- a/opencompass/tasks/openicl_eval.py
+++ b/opencompass/tasks/openicl_eval.py
@@ -217,7 +217,8 @@ class OpenICLEvalTask(BaseTask):
            }
             k = self.dataset_cfg.get('k', 1)
             repeat = self.dataset_cfg.get('repeat', 1)
-            result = icl_evaluator.evaluate(k, repeat, test_set, **preds)
+            result = icl_evaluator.evaluate(k, repeat, copy.deepcopy(test_set),
+                                            **preds)
 
             # Get model postprocess result
             model_details = None
@@ -225,7 +226,8 @@ class OpenICLEvalTask(BaseTask):
             if 'model_postprocessor' in self.eval_cfg:
                 model_preds = copy.deepcopy(preds)
                 model_preds['predictions'] = model_pred_strs
-                model_result = icl_evaluator.evaluate(k, repeat, test_set,
+                model_result = icl_evaluator.evaluate(k, repeat,
+                                                      copy.deepcopy(test_set),
                                                       **model_preds)
                 for key in model_result:
                     if key == 'details':
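
For reference, a minimal sketch (not part of the patch) of the per-repeat slicing that the refactored BaseEvaluator.evaluate performs before each score call. The standalone select_fn and the toy data below are illustrative assumptions; the actual method lives in opencompass/openicl/icl_evaluator/icl_base_evaluator.py.

from collections.abc import Iterable

from datasets import Dataset


def select_fn(i, real_size, x):
    """Return the i-th repeat's slice of x."""
    if isinstance(x, Dataset):
        # HuggingFace Datasets are sliced with select(); plain [] slicing on a
        # Dataset returns a dict of columns rather than a smaller Dataset.
        return x.select(range(i * real_size, (i + 1) * real_size))
    elif isinstance(x, Iterable):
        # Lists of predictions/references fall into this branch.
        return x[i * real_size:(i + 1) * real_size]
    else:
        return x


k, repeat = [4, 8, 16], 3
n = max(k) * repeat                    # 48 generations per example
predictions = [f'pred_{j}' for j in range(2 * n)]  # 2 examples x 48 repeats
real_size = len(predictions) // n      # number of distinct examples

for i in range(n):
    chunk = select_fn(i, real_size, predictions)
    assert len(chunk) == real_size     # one prediction per example per repeat

The Dataset branch is what distinguishes the helper from the previous inline slicing: a test_set passed through score_kwargs is now reduced to a per-repeat sub-Dataset consistently with the plain prediction lists, so each score call sees exactly one generation per example.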