OpenCompass/opencompass/openicl/icl_evaluator/icl_gpassk_evaluator.py

from abc import abstractmethod
from typing import Any, Dict, List, Union

import numpy as np
from scipy.stats import hypergeom

from opencompass.registry import ICL_EVALUATORS

from .icl_base_evaluator import BaseEvaluator


def compute_pass_at_k(n, c, k):
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))


def _compute_g_pass_at_k(n, c, k, m):
    if m > min(c, k) or k > n or c < 0 or n <= 0 or m < 0:
        return 0.0
    return hypergeom.sf(m - 1, n, c, k)


def compute_g_pass_at_k(n, c, k, t):
    m = max(int(np.ceil(k * t)), 1)
    return _compute_g_pass_at_k(n, c, k, m)


def compute_mg_pass_at_k(n, c, k):
    l, r = int(np.ceil(k * 0.5)), k

    mg_pass_at_k = 0.0
    for i in range(l + 1, r + 1):
        mg_pass_at_k += _compute_g_pass_at_k(n, c, k, i)
    mg_pass_at_k = 2 * mg_pass_at_k / k

    return mg_pass_at_k


@ICL_EVALUATORS.register_module()
class GPassKEvaluator(BaseEvaluator):
    """Evaluator for computing the G-Pass@k Metric.

    This evaluator performs the following steps:
    1. Invokes task-specific `preprocess` on predictions to
    assign a consistency label to each prediction and its
    corresponding reference.
    2. Calculates metrics for each input example based on
    these labels.
    3. Aggregates the overall metrics through a task-specific
    `postprocess`.

    Args:
        k (int or list of int): Number of predictions to be
        considered in G-Pass@k. It can be a single integer
        (e.g., `k=16` computes G-Pass@16) or a list of
        integers (e.g., `[4, 8, 16]` computes G-Pass@4,
        G-Pass@8, and G-Pass@16).

        replication (int): Controls the number of generations
        used to estimate G-Pass@k. The total number of
        generations is determined by multiplying the
        maximum of `k` with `replication`. This parameter
        should be a single integer.

        thresholds (list of float): A list of floating-point
        numbers that define the thresholds for the G-Pass@k
        metric.
    """

    def __init__(
            self,
            k: Union[int, List[int]] = 16,
            replication: int = 3,
            thresholds: List[float] = [0.0, 0.25, 0.5, 0.75, 1.0]) -> None:
        super().__init__()

        if isinstance(k, int):
            k = [k]

        self.k = k
        self.replication = replication
        self.n = max(k) * replication
        self.thresholds = thresholds

    @property
    def output_dir(self):
        # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200
        return self._out_dir

    @abstractmethod
    def preprocess(self, predictions, references, test_set) -> None:
        """Perform operations on predictions before computing metrics, for
        example, do answer_extraction and model_judge in mathematical reasoning
        task.

        Return:
            labels: A list contains the label which indicates whether
            prediction is consistency with reference at each position.
        """
        raise NotImplementedError

    @abstractmethod
    def group(self, predictions, labels, test_set) -> Dict[str, Any]:
        """Group the predictions and references.

        Return:
            A dict contains the grouped predictions and references.
        """
        raise NotImplementedError

    @abstractmethod
    def reduce(self, details) -> Dict[str, Any]:
        """Aggregate the overall metrics.

        Return:
            A dict contains overall metrics, like:
            {'details': details for each example, 'G-Pass@16': xxx}
        """
        raise NotImplementedError

    def score(self, predictions, references, test_set) -> Dict[str, Any]:
        """Compute G-Pass@k metrics.

        Return:
            A dict contains  metrics for each dataset sample and
            overall metrics reduced by `self.reduce`, like:
            {'details': details for each example, 'G-Pass@16': xxx}
        """
        labels = self.preprocess(predictions, references, test_set)
        grouped_examples = self.group(predictions, labels, test_set)

        details = []
        total_pass_num, count = 0, 0
        for example_abbr, examples in grouped_examples.items():
            detail = {
                k: v
                for k, v in examples[0].items()
                if k not in ['prediction', 'label']
            }
            detail.update({
                'predictions': [{
                    'prediction': example['prediction'],
                    'label': example['label']
                } for example in examples],
            })

            current_example_labels = [e['label'] for e in examples]
            c = int(np.sum(current_example_labels))

            for k in self.k:
                for threshold in self.thresholds:
                    detail[f'G-Pass@{k}_{threshold}'] = compute_g_pass_at_k(
                        n=self.n, c=c, k=k, t=threshold)
                detail[f'mG-Pass@{k}'] = compute_mg_pass_at_k(n=self.n,
                                                              c=c,
                                                              k=k)
            count += self.n
            total_pass_num += c

            details.append(detail)

        return self.reduce(details)
[Feature] Support G-Pass@k and LiveMathBench (#1772) * support G-Pass@k and livemathbench * fix bugs * fix comments of GPassKEvaluator * update saved details of GPassKEvaluator * update saved details of GPassKEvaluator * fix eval api configs & update openai_api for ease of debugging * update huggingface path * fix method name of G-Pass@k * fix default value of eval_model_name * refactor G-Pass@k evaluator * log generation params for each backend * fix evaluation resume * add notimplementerror 2024-12-30 16:59:39 +08:00			`from abc import abstractmethod`
			`from typing import Any, Dict, List, Union`

			`import numpy as np`
			`from scipy.stats import hypergeom`

			`from opencompass.registry import ICL_EVALUATORS`

			`from .icl_base_evaluator import BaseEvaluator`


			`def compute_pass_at_k(n, c, k):`
			`if n - c < k:`
			`return 1.0`
			`return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))`


			`def _compute_g_pass_at_k(n, c, k, m):`
			`if m > min(c, k) or k > n or c < 0 or n <= 0 or m < 0:`
			`return 0.0`
			`return hypergeom.sf(m - 1, n, c, k)`


			`def compute_g_pass_at_k(n, c, k, t):`
			`m = max(int(np.ceil(k * t)), 1)`
			`return _compute_g_pass_at_k(n, c, k, m)`


			`def compute_mg_pass_at_k(n, c, k):`
			`l, r = int(np.ceil(k * 0.5)), k`

			`mg_pass_at_k = 0.0`
			`for i in range(l + 1, r + 1):`
			`mg_pass_at_k += _compute_g_pass_at_k(n, c, k, i)`
			`mg_pass_at_k = 2 * mg_pass_at_k / k`

			`return mg_pass_at_k`


			`@ICL_EVALUATORS.register_module()`
			`class GPassKEvaluator(BaseEvaluator):`
			`"""Evaluator for computing the G-Pass@k Metric.`

			`This evaluator performs the following steps:`
			1. Invokes task-specific `preprocess` on predictions to
			`assign a consistency label to each prediction and its`
			`corresponding reference.`
			`2. Calculates metrics for each input example based on`
			`these labels.`
			`3. Aggregates the overall metrics through a task-specific`
			`postprocess`.

			`Args:`
			`k (int or list of int): Number of predictions to be`
			`considered in G-Pass@k. It can be a single integer`
			(e.g., `k=16` computes G-Pass@16) or a list of
			integers (e.g., `[4, 8, 16]` computes G-Pass@4,
			`G-Pass@8, and G-Pass@16).`

			`replication (int): Controls the number of generations`
			`used to estimate G-Pass@k. The total number of`
			`generations is determined by multiplying the`
			maximum of `k` with `replication`. This parameter
			`should be a single integer.`

			`thresholds (list of float): A list of floating-point`
			`numbers that define the thresholds for the G-Pass@k`
			`metric.`
			`"""`

			`def __init__(`
			`self,`
			`k: Union[int, List[int]] = 16,`
			`replication: int = 3,`
			`thresholds: List[float] = [0.0, 0.25, 0.5, 0.75, 1.0]) -> None:`
			`super().__init__()`

			`if isinstance(k, int):`
			`k = [k]`

			`self.k = k`
			`self.replication = replication`
			`self.n = max(k) * replication`
			`self.thresholds = thresholds`

			`@property`
			`def output_dir(self):`
			`# please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200`
			`return self._out_dir`

			`@abstractmethod`
			`def preprocess(self, predictions, references, test_set) -> None:`
			`"""Perform operations on predictions before computing metrics, for`
			`example, do answer_extraction and model_judge in mathematical reasoning`
			`task.`

			`Return:`
			`labels: A list contains the label which indicates whether`
			`prediction is consistency with reference at each position.`
			`"""`
			`raise NotImplementedError`

			`@abstractmethod`
			`def group(self, predictions, labels, test_set) -> Dict[str, Any]:`
			`"""Group the predictions and references.`

			`Return:`
			`A dict contains the grouped predictions and references.`
			`"""`
			`raise NotImplementedError`

			`@abstractmethod`
			`def reduce(self, details) -> Dict[str, Any]:`
			`"""Aggregate the overall metrics.`

			`Return:`
			`A dict contains overall metrics, like:`
			`{'details': details for each example, 'G-Pass@16': xxx}`
			`"""`
			`raise NotImplementedError`

			`def score(self, predictions, references, test_set) -> Dict[str, Any]:`
			`"""Compute G-Pass@k metrics.`

			`Return:`
			`A dict contains metrics for each dataset sample and`
			overall metrics reduced by `self.reduce`, like:
			`{'details': details for each example, 'G-Pass@16': xxx}`
			`"""`
			`labels = self.preprocess(predictions, references, test_set)`
			`grouped_examples = self.group(predictions, labels, test_set)`

			`details = []`
			`total_pass_num, count = 0, 0`
			`for example_abbr, examples in grouped_examples.items():`
			`detail = {`
			`k: v`
			`for k, v in examples[0].items()`
			`if k not in ['prediction', 'label']`
			`}`
			`detail.update({`
			`'predictions': [{`
			`'prediction': example['prediction'],`
			`'label': example['label']`
			`} for example in examples],`
			`})`

			`current_example_labels = [e['label'] for e in examples]`
			`c = int(np.sum(current_example_labels))`

			`for k in self.k:`
			`for threshold in self.thresholds:`
			`detail[f'G-Pass@{k}_{threshold}'] = compute_g_pass_at_k(`
			`n=self.n, c=c, k=k, t=threshold)`
			`detail[f'mG-Pass@{k}'] = compute_mg_pass_at_k(n=self.n,`
			`c=c,`
			`k=k)`
			`count += self.n`
			`total_pass_num += c`

			`details.append(detail)`

			`return self.reduce(details)`