OpenCompass/opencompass/openicl/icl_evaluator/icl_gpassk_evaluator.py
Junnan Liu 8e8d4f1c64
[Feature] Support G-Pass@k and LiveMathBench (#1772)
* support G-Pass@k and livemathbench

* fix bugs

* fix comments of GPassKEvaluator

* update saved details of GPassKEvaluator

* update saved details of GPassKEvaluator

* fix eval api configs & update openai_api for ease of debugging

* update huggingface path

* fix method name of G-Pass@k

* fix default value of eval_model_name

* refactor G-Pass@k evaluator

* log generation params for each backend

* fix evaluation resume

* add notimplementerror
2024-12-30 16:59:39 +08:00

164 lines
5.2 KiB
Python

from abc import abstractmethod
from typing import Any, Dict, List, Union
import numpy as np
from scipy.stats import hypergeom
from opencompass.registry import ICL_EVALUATORS
from .icl_base_evaluator import BaseEvaluator
def compute_pass_at_k(n, c, k):
    """Estimate pass@k from ``n`` samples of which ``c`` are correct.

    The value is ``1 - C(n - c, k) / C(n, k)``, evaluated as a running
    product so that no large binomial coefficient is ever formed.

    Args:
        n (int): Total number of generated samples.
        c (int): Number of correct samples among the ``n``.
        k (int): Number of samples drawn without replacement.

    Returns:
        float: Probability that at least one of the ``k`` drawn samples
        is correct.
    """
    if c <= 0:
        # With no correct sample nothing can pass. Without this guard the
        # shortcut below would report 1.0 whenever k > n.
        return 0.0
    if n - c < k:
        # Fewer than k incorrect samples exist, so every draw of k samples
        # necessarily contains a correct one.
        return 1.0
    # prod_{i = n-c+1}^{n} (1 - k / i) == C(n - c, k) / C(n, k)
    fail_prob = np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
    return float(1.0 - fail_prob)
def _compute_g_pass_at_k(n, c, k, m):
if m > min(c, k) or k > n or c < 0 or n <= 0 or m < 0:
return 0.0
return hypergeom.sf(m - 1, n, c, k)
def compute_g_pass_at_k(n, c, k, t):
    """Compute G-Pass@k for threshold ``t``.

    The threshold is converted into a minimum count of correct samples,
    ``m = max(ceil(k * t), 1)``, and the probability of reaching that count
    is delegated to ``_compute_g_pass_at_k``.

    Args:
        n (int): Total number of generated samples.
        c (int): Number of correct samples among the ``n``.
        k (int): Number of samples drawn.
        t (float): Fraction of the ``k`` draws that must be correct.

    Returns:
        float: Probability that at least ``m`` of the ``k`` draws are
        correct.
    """
    # Binary rounding can push an exact product just above an integer
    # (100 * 0.14 == 14.000000000000002), which would make ceil() demand
    # one correct sample too many. Rounding first removes that noise.
    required = int(np.ceil(round(k * t, 9)))
    # At least one correct sample is always required, even for t == 0.
    m = max(required, 1)
    return _compute_g_pass_at_k(n, c, k, m)
def compute_mg_pass_at_k(n, c, k):
    """Compute mG-Pass@k from ``n`` samples of which ``c`` are correct.

    Sums ``_compute_g_pass_at_k`` over every required-correct count ``m``
    from ``ceil(k / 2) + 1`` up to ``k`` inclusive and scales the sum by
    ``2 / k``.

    Args:
        n (int): Total number of generated samples.
        c (int): Number of correct samples among the ``n``.
        k (int): Number of samples drawn; must be non-zero.

    Returns:
        float: The mG-Pass@k value.
    """
    lower = int(np.ceil(k * 0.5))
    # The range starts one above the midpoint, so m == ceil(k / 2) itself
    # is not part of the sum.
    total = sum(
        (_compute_g_pass_at_k(n, c, k, m) for m in range(lower + 1, k + 1)),
        0.0,
    )
    return 2 * total / k
@ICL_EVALUATORS.register_module()
class GPassKEvaluator(BaseEvaluator):
    """Evaluator for computing the G-Pass@k Metric.

    This evaluator performs the following steps:

    1. Invokes task-specific `preprocess` on predictions to
       assign a consistency label to each prediction and its
       corresponding reference.
    2. Groups the labelled predictions per input example through a
       task-specific `group`.
    3. Calculates metrics for each input example based on
       these labels.
    4. Aggregates the overall metrics through a task-specific
       `reduce`.

    Args:
        k (int or list of int): Number of predictions to be
            considered in G-Pass@k. It can be a single integer
            (e.g., `k=16` computes G-Pass@16) or a list of
            integers (e.g., `[4, 8, 16]` computes G-Pass@4,
            G-Pass@8, and G-Pass@16).
        replication (int): Controls the number of generations
            used to estimate G-Pass@k. The total number of
            generations is determined by multiplying the
            maximum of `k` with `replication`. This parameter
            should be a single integer.
        thresholds (list of float): A list of floating-point
            numbers that define the thresholds for the G-Pass@k
            metric.
    """

    def __init__(
            self,
            k: Union[int, List[int]] = 16,
            replication: int = 3,
            thresholds: List[float] = [0.0, 0.25, 0.5, 0.75, 1.0]) -> None:
        super().__init__()

        if isinstance(k, int):
            k = [k]

        # Copy both sequences: the default `thresholds` list is one object
        # shared by every call, and a caller-supplied list may be mutated
        # later. Storing a copy keeps each instance independent.
        self.k = list(k)
        self.replication = replication
        # Total number of generations assumed per input example.
        self.n = max(self.k) * replication
        self.thresholds = list(thresholds)

    @property
    def output_dir(self):
        """Return `self._out_dir`.

        `_out_dir` is not assigned anywhere in this class; it has to be
        set on the instance from outside before this property is read.
        """
        # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200
        return self._out_dir

    @abstractmethod
    def preprocess(self, predictions, references, test_set) -> List[Any]:
        """Perform operations on predictions before computing metrics, for
        example, do answer_extraction and model_judge in mathematical reasoning
        task.

        Return:
            labels: A list containing the labels which indicate whether
                the prediction is consistent with the reference at each
                position.
        """
        raise NotImplementedError

    @abstractmethod
    def group(self, predictions, labels, test_set) -> Dict[str, Any]:
        """Group the predictions and references.

        Return:
            A dict containing the grouped predictions and references.
            `score` iterates over its values and expects each value to be
            a non-empty sequence of dicts that have at least the keys
            'prediction' and 'label'.
        """
        raise NotImplementedError

    @abstractmethod
    def reduce(self, details) -> Dict[str, Any]:
        """Aggregate the overall metrics.

        Args:
            details: The list of per-example dicts built by `score`.

        Return:
            A dict containing overall metrics, like:
            {'details': details for each example, 'G-Pass@16': xxx}
        """
        raise NotImplementedError

    def score(self, predictions, references, test_set) -> Dict[str, Any]:
        """Compute G-Pass@k metrics.

        Return:
            A dict containing metrics for each dataset sample and
            overall metrics reduced by `self.reduce`, like:
            {'details': details for each example, 'G-Pass@16': xxx}
        """
        labels = self.preprocess(predictions, references, test_set)
        grouped_examples = self.group(predictions, labels, test_set)

        details = []
        for examples in grouped_examples.values():
            # Fields shared by the whole group are taken from its first
            # entry; per-generation fields are collected separately below.
            detail = {
                key: value
                for key, value in examples[0].items()
                if key not in ('prediction', 'label')
            }
            detail['predictions'] = [{
                'prediction': example['prediction'],
                'label': example['label'],
            } for example in examples]

            # NOTE(review): `c` counts over however many entries the group
            # holds while `n` is always the configured `self.n`; this
            # presumes every group has exactly `self.n` generations --
            # confirm against the `group` implementations.
            c = int(np.sum([example['label'] for example in examples]))

            for k in self.k:
                for threshold in self.thresholds:
                    detail[f'G-Pass@{k}_{threshold}'] = compute_g_pass_at_k(
                        n=self.n, c=c, k=k, t=threshold)
                detail[f'mG-Pass@{k}'] = compute_mg_pass_at_k(n=self.n,
                                                              c=c,
                                                              k=k)

            details.append(detail)

        return self.reduce(details)