Delete gpassk_evaluator and fix potential errors

jnanliu 2025-02-24 06:25:17 +00:00
parent 6d5a996deb
commit 2349fcff2c
8 changed files with 40 additions and 238 deletions

View File

@@ -41,10 +41,7 @@ livemathbench_dataset = dict(
             url=[],
             use_extract_model=False,
             extract_url=[],
-            extract_model_name='',
-            k=[4, 8, 16],
-            repeat=3,
-            thresholds=[0.0, 0.25, 0.5, 0.75, 1.0]
+            extract_model_name=''
         )
     )
 )
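Note: the `k`, `repeat`, and `thresholds` arguments removed from the evaluator config here are not simply dropped; after this commit, `OpenICLEvalTask` reads `k` and `repeat` from the dataset config instead (see the `self.dataset_cfg.get('k', 1)` and `self.dataset_cfg.get('repeat', 1)` calls in the last file of this diff). A minimal sketch of the resulting shape, assuming the keys sit at the top level of the dataset dict; everything except `k`, `repeat`, and the evaluator fields shown above is illustrative:

    livemathbench_dataset = dict(
        # ... reader_cfg / infer_cfg omitted ...
        k=[4, 8, 16],  # consumed by OpenICLEvalTask, not by the evaluator
        repeat=3,      # likewise read via dataset_cfg.get('repeat', 1)
        eval_cfg=dict(
            evaluator=dict(
                url=[],
                use_extract_model=False,
                extract_url=[],
                extract_model_name=''
            )
        )
    )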

View File

@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .livemathbench_greedy_gen_efb20d import livemathbench_datasets  # noqa: F401, F403
+    from .livemathbench_greedy_gen_9befbf import livemathbench_datasets  # noqa: F401, F403

View File

@@ -41,10 +41,7 @@ livemathbench_dataset = dict(
             url=[],
             use_extract_model=False,
             extract_url=[],
-            extract_model_name='',
-            k=[1],
-            repeat=1,
-            thresholds=[0.0]
+            extract_model_name=''
         )
     )
 )

View File

@@ -1,10 +1,9 @@
 import os
 import warnings
-from collections import OrderedDict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from functools import partial
 from itertools import product
-from typing import Any, Callable, Dict, List, Union
+from typing import Any, Callable, Dict, List

 import jsonlines
 import mmengine
@@ -13,7 +12,7 @@ from datasets import Dataset, load_dataset
 from opencompass.datasets.math import MATHAgentEvaluator, math_postprocess_v2
 from opencompass.models import OpenAISDK
-from opencompass.openicl.icl_evaluator import GPassKEvaluator
+from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.openicl.icl_inferencer.icl_base_inferencer import \
     dump_results_dict
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, MODELS
@@ -107,7 +106,7 @@ class LiveMathBenchDataset(BaseDataset):

 @ICL_EVALUATORS.register_module()
-class LiveMathBenchEvaluator(GPassKEvaluator):
+class LiveMathBenchEvaluator(BaseEvaluator):
     api_meta_template = dict(round=[
         dict(role='HUMAN', api_role='HUMAN'),
         dict(role='BOT', api_role='BOT', generate=True),
@@ -118,11 +117,8 @@ class LiveMathBenchEvaluator(GPassKEvaluator):
                  url,
                  use_extract_model=False,
                  extract_url=[],
-                 extract_model_name='',
-                 k: Union[int, List[int]] = 16,
-                 repeat: int = 3,
-                 thresholds: List[float] = [0.0, 0.25, 0.5, 0.75, 1.0]):
-        super().__init__(k, repeat, thresholds)
+                 extract_model_name=''):
+        super().__init__()

         if isinstance(url, str):
             url = [url]
@@ -303,55 +299,18 @@ class LiveMathBenchEvaluator(BaseEvaluator):
     def preprocess(self, predictions, references, test_set):
         return self.judge(predictions, references, test_set)

-    def group(self, predictions, labels, test_set):
-        example2replications = {}
-        for example, label, prediction in zip(test_set, labels, predictions):
-            example_abbr = f"{example['subdivision']}_{example['idx']}"
-            if example_abbr not in example2replications:
-                example2replications[example_abbr] = []
-            example.update({'prediction': prediction, 'label': label})
-            example2replications[example_abbr].append(example)
-        for _, replications in example2replications.items():
-            assert len(replications) == self.n, print(len(replications),
-                                                      self.n)
-        return example2replications
-
-    def reduce(self, details) -> Dict[str, Any]:
-        """Aggregate the overall metrics.
-
-        Return:
-            A dict contains overall metrics, like:
-            {'details': details for each example, 'G-Pass@16': xxx}
-        """
-        g_passk_details = OrderedDict()
-        g_passk_details['details'] = details
-
-        all_dataset = set([detail['subdivision'] for detail in details])
-
-        for k in self.k:
-            for subdivision in sorted(list(all_dataset)):
-                for threshold in self.thresholds:
-                    g_passk_details[
-                        f'{subdivision}/G-Pass@{k}_{threshold}'] = \
-                        100. * np.mean([
-                            detail[f'G-Pass@{k}_{threshold}']
-                            for detail in details
-                            if detail['subdivision'] == subdivision
-                        ])
-                g_passk_details[f'{subdivision}/mG-Pass@{k}'] = \
-                    100. * np.mean([
-                        detail[f'mG-Pass@{k}'] for detail in details
-                        if detail['subdivision'] == subdivision
-                    ])
-            for threshold in self.thresholds:
-                g_passk_details[f'G-Pass@{k}_{threshold}'] = 100. * np.mean(
-                    [detail[f'G-Pass@{k}_{threshold}'] for detail in details])
-            g_passk_details[f'mG-Pass@{k}'] = 100. * np.mean(
-                [detail[f'mG-Pass@{k}'] for detail in details])
-
-        return g_passk_details
+    def score(self, predictions, references, test_set) -> Dict[str, Any]:
+        labels = self.preprocess(predictions, references, test_set)
+        results = {'accuracy': 100 * np.mean(labels), 'details': []}
+        for pred, ref, label in zip(predictions, references, labels):
+            results['details'].append({
+                'pred': pred,
+                'ref': ref,
+                'correct': label
+            })
+        return results


 class LiveMathBenchOutputHandler:
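With GPassKEvaluator gone, `score` reduces to plain mean accuracy over the judge labels plus a per-sample detail list. A self-contained illustration of the returned structure, using dummy data rather than real judge output:

    import numpy as np

    labels = [True, False, True, True]  # what self.preprocess(...) yields
    predictions = ['a', 'b', 'c', 'd']
    references = ['a', 'x', 'c', 'd']

    results = {'accuracy': 100 * np.mean(labels), 'details': []}
    for pred, ref, label in zip(predictions, references, labels):
        results['details'].append({'pred': pred, 'ref': ref, 'correct': label})

    print(results['accuracy'])  # 75.0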

View File

@@ -4,7 +4,6 @@ from .icl_base_evaluator import BaseEvaluator  # noqa
 from .icl_bpc_evaluator import BPCEvaluator  # noqa
 from .icl_circular_evaluator import CircularEvaluator  # noqa
 from .icl_em_evaluator import EMEvaluator  # noqa
-from .icl_gpassk_evaluator import GPassKEvaluator  # noqa
 from .icl_hf_evaluator import *  # noqa
 from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator  # noqa
 from .icl_misc_evaluator import AverageInferencePPLEvaluator  # noqa

View File

@@ -41,6 +41,11 @@ class BaseEvaluator:
     def __init__(self) -> None:
         pass

+    @property
+    def output_dir(self):
+        # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200
+        return self._out_dir
+
     def group(self, n: int, details: List[Dict[str, Any]],
               test_set: Dataset) -> Dict[str, Any]:
         example2replications = {}
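The `output_dir` property moved here from the deleted GPassKEvaluator does nothing but expose `self._out_dir`, which, per the comment, the eval task attaches from outside. A sketch of that contract; the path below is made up:

    from opencompass.openicl.icl_evaluator import BaseEvaluator

    evaluator = BaseEvaluator()
    # Attached externally before scoring (see opencompass/tasks/openicl_eval.py):
    evaluator._out_dir = 'outputs/eval/some_model/some_dataset'
    assert evaluator.output_dir == evaluator._out_dir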
@@ -77,18 +82,24 @@ class BaseEvaluator:
         return g_passk_details

     def evaluate(self, k: Union[int, List[int]], repeat: int,
-                 test_set: Dataset, **score_kwargs):
+                 original_dataset: Dataset, **score_kwargs):
         n = (max(k) if isinstance(k, List) else k) * repeat
-        print(len(score_kwargs['predictions']))
-        real_size = len(test_set) // n
+        real_size = len(original_dataset) // n
         all_details = []
         all_results = []
         for i in range(n):
+
+            def select_fn(i, real_size, x):
+                if isinstance(x, Dataset):
+                    return x.select(range(i * real_size, (i + 1) * real_size))
+                elif isinstance(x, Iterable):
+                    return x[i * real_size:(i + 1) * real_size]
+                else:
+                    return x
+
             results = self.score(
                 **{
-                    key:
-                    value[i * real_size:(i + 1) *
-                          real_size] if isinstance(value, Iterable) else value
+                    key: select_fn(i, real_size, value)
                     for key, value in score_kwargs.items()
                 })
             details = results.pop('details', None)
@@ -118,7 +129,7 @@ class BaseEvaluator:
             else:
                 eval_results[key] = eval_results[key][0]

-        grouped_examples = self.group(n, all_details, test_set)
+        grouped_examples = self.group(n, all_details, original_dataset)
         can_calculate = False
         if len(all_details) != 0:
             eval_details = []
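The new `select_fn` helper appears to assume the inputs are laid out replication-major: with `n = max(k) * repeat` and `real_size = len(original_dataset) // n`, block `i` of size `real_size` holds the i-th replication of every example. A toy demonstration of the slicing (the repo version also handles the `Dataset` branch via `.select(...)` and slices any `Iterable`; the extra `str` guard below is my addition to keep the toy safe):

    from collections.abc import Iterable

    def select_fn(i, real_size, x):
        if isinstance(x, Iterable) and not isinstance(x, str):
            return x[i * real_size:(i + 1) * real_size]
        return x

    # 3 replications of a 2-example dataset, replication-major:
    predictions = [f'rep{r}_ex{e}' for r in range(3) for e in range(2)]
    print(select_fn(0, 2, predictions))  # ['rep0_ex0', 'rep0_ex1']
    print(select_fn(2, 2, predictions))  # ['rep2_ex0', 'rep2_ex1']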

View File

@@ -1,163 +0,0 @@
-from abc import abstractmethod
-from typing import Any, Dict, List, Union
-
-import numpy as np
-from scipy.stats import hypergeom
-
-from opencompass.registry import ICL_EVALUATORS
-
-from .icl_base_evaluator import BaseEvaluator
-
-
-def compute_pass_at_k(n, c, k):
-    if n - c < k:
-        return 1.0
-    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
-
-
-def _compute_g_pass_at_k(n, c, k, m):
-    if m > min(c, k) or k > n or c < 0 or n <= 0 or m < 0:
-        return 0.0
-    return hypergeom.sf(m - 1, n, c, k)
-
-
-def compute_g_pass_at_k(n, c, k, t):
-    m = max(int(np.ceil(k * t)), 1)
-    return _compute_g_pass_at_k(n, c, k, m)
-
-
-def compute_mg_pass_at_k(n, c, k):
-    l, r = int(np.ceil(k * 0.5)), k
-
-    mg_pass_at_k = 0.0
-    for i in range(l + 1, r + 1):
-        mg_pass_at_k += _compute_g_pass_at_k(n, c, k, i)
-    mg_pass_at_k = 2 * mg_pass_at_k / k
-
-    return mg_pass_at_k
-
-
-@ICL_EVALUATORS.register_module()
-class GPassKEvaluator(BaseEvaluator):
-    """Evaluator for computing the G-Pass@k Metric.
-
-    This evaluator performs the following steps:
-    1. Invokes task-specific `preprocess` on predictions to
-       assign a consistency label to each prediction and its
-       corresponding reference.
-    2. Calculates metrics for each input example based on
-       these labels.
-    3. Aggregates the overall metrics through a task-specific
-       `postprocess`.
-
-    Args:
-        k (int or list of int): Number of predictions to be
-            considered in G-Pass@k. It can be a single integer
-            (e.g., `k=16` computes G-Pass@16) or a list of
-            integers (e.g., `[4, 8, 16]` computes G-Pass@4,
-            G-Pass@8, and G-Pass@16).
-        repeat (int): Controls the number of generations
-            used to estimate G-Pass@k. The total number of
-            generations is determined by multiplying the
-            maximum of `k` with `repeat`. This parameter
-            should be a single integer.
-        thresholds (list of float): A list of floating-point
-            numbers that define the thresholds for the
-            G-Pass@k metric.
-    """
-
-    def __init__(
-            self,
-            k: Union[int, List[int]] = 16,
-            repeat: int = 3,
-            thresholds: List[float] = [0.0, 0.25, 0.5, 0.75, 1.0]) -> None:
-        super().__init__()
-
-        if isinstance(k, int):
-            k = [k]
-        self.k = k
-        self.repeat = repeat
-        self.n = max(k) * repeat
-        self.thresholds = thresholds
-
-    @property
-    def output_dir(self):
-        # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200
-        return self._out_dir
-
-    @abstractmethod
-    def preprocess(self, predictions, references, test_set) -> None:
-        """Perform operations on predictions before computing metrics, for
-        example, do answer_extraction and model_judge in mathematical
-        reasoning task.
-
-        Return:
-            labels: A list contains the label which indicates whether
-            prediction is consistency with reference at each position.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def group(self, predictions, labels, test_set) -> Dict[str, Any]:
-        """Group the predictions and references.
-
-        Return:
-            A dict contains the grouped predictions and references.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def reduce(self, details) -> Dict[str, Any]:
-        """Aggregate the overall metrics.
-
-        Return:
-            A dict contains overall metrics, like:
-            {'details': details for each example, 'G-Pass@16': xxx}
-        """
-        raise NotImplementedError
-
-    def score(self, predictions, references, test_set) -> Dict[str, Any]:
-        """Compute G-Pass@k metrics.
-
-        Return:
-            A dict contains metrics for each dataset sample and
-            overall metrics reduced by `self.reduce`, like:
-            {'details': details for each example, 'G-Pass@16': xxx}
-        """
-        labels = self.preprocess(predictions, references, test_set)
-        grouped_examples = self.group(predictions, labels, test_set)
-
-        details = []
-        total_pass_num, count = 0, 0
-        for example_abbr, examples in grouped_examples.items():
-            detail = {
-                k: v
-                for k, v in examples[0].items()
-                if k not in ['prediction', 'label']
-            }
-            detail.update({
-                'predictions': [{
-                    'prediction': example['prediction'],
-                    'label': example['label']
-                } for example in examples],
-            })
-            current_example_labels = [e['label'] for e in examples]
-            c = int(np.sum(current_example_labels))
-
-            for k in self.k:
-                for threshold in self.thresholds:
-                    detail[f'G-Pass@{k}_{threshold}'] = compute_g_pass_at_k(
-                        n=self.n, c=c, k=k, t=threshold)
-                detail[f'mG-Pass@{k}'] = compute_mg_pass_at_k(n=self.n,
-                                                              c=c,
-                                                              k=k)
-            count += self.n
-            total_pass_num += c
-
-            details.append(detail)
-
-        return self.reduce(details)
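For reference, the deleted metric is the tail of a hypergeometric distribution: given n generations of which c are judged correct, G-Pass@k at threshold t is the probability that at least ceil(k*t) of k generations drawn without replacement are correct, and mG-Pass@k averages the G-Pass values over the thresholds above 0.5. A numeric sanity check matching the deleted helpers (requires numpy and scipy):

    import numpy as np
    from scipy.stats import hypergeom

    n, c, k = 48, 30, 16  # 48 generations, 30 judged correct, evaluate at k=16
    for t in [0.0, 0.25, 0.5, 0.75, 1.0]:
        m = max(int(np.ceil(k * t)), 1)
        # P[X >= m] with X ~ Hypergeom(total=n, successes=c, draws=k)
        print(f'G-Pass@{k}_{t} =', hypergeom.sf(m - 1, n, c, k))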

View File

@@ -217,7 +217,8 @@ class OpenICLEvalTask(BaseTask):
             }

             k = self.dataset_cfg.get('k', 1)
             repeat = self.dataset_cfg.get('repeat', 1)
-            result = icl_evaluator.evaluate(k, repeat, test_set, **preds)
+            result = icl_evaluator.evaluate(k, repeat, copy.deepcopy(test_set),
+                                            **preds)

             # Get model postprocess result
             model_details = None
@@ -225,7 +226,8 @@ class OpenICLEvalTask(BaseTask):
             if 'model_postprocessor' in self.eval_cfg:
                 model_preds = copy.deepcopy(preds)
                 model_preds['predictions'] = model_pred_strs
-                model_result = icl_evaluator.evaluate(k, repeat, test_set,
-                                                      **model_preds)
+                model_result = icl_evaluator.evaluate(k, repeat,
+                                                      copy.deepcopy(test_set),
+                                                      **model_preds)
                 for key in model_result:
                     if key == 'details':
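Both evaluate call sites now pass `copy.deepcopy(test_set)`. The commit message does not say why, but a plausible reason is that grouping mutates the examples in place (the deleted `group` did `example.update({'prediction': ..., 'label': ...})`), so sharing one `test_set` between the two `evaluate` calls would leak fields from the first run into the second. A toy illustration of that failure mode:

    import copy

    test_set = [{'idx': 0, 'question': 'q0'}]

    def evaluate(ds):
        for example in ds:
            example.update({'label': True})  # mirrors the in-place update in group()
        return ds

    evaluate(copy.deepcopy(test_set))  # original rows stay untouched
    assert 'label' not in test_set[0]

    evaluate(test_set)                 # without the copy, rows are mutated
    assert test_set[0]['label'] is True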