Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Commit 2349fcff2c: delete gpassk_evaluator and fix potential errors
Parent: 6d5a996deb

This commit removes the standalone `GPassKEvaluator`, folds its replication handling into `BaseEvaluator.evaluate`, simplifies `LiveMathBenchEvaluator.score` to plain accuracy, and deep-copies `test_set` in `OpenICLEvalTask` to avoid cross-run mutation.
@@ -41,10 +41,7 @@ livemathbench_dataset = dict(
             url=[],
             use_extract_model=False,
             extract_url=[],
-            extract_model_name='',
-            k=[4, 8, 16],
-            repeat=3,
-            thresholds=[0.0, 0.25, 0.5, 0.75, 1.0]
+            extract_model_name=''
         )
     )
 )
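With `k`, `repeat`, and `thresholds` gone from the evaluator config, the replication settings are now supplied at the dataset level, where the eval task reads them via `self.dataset_cfg.get('k', 1)` and `self.dataset_cfg.get('repeat', 1)` (see the OpenICLEvalTask hunks at the end of this diff). A minimal sketch of the assumed dataset-side placement (illustrative, not copied from the repo):

livemathbench_dataset = dict(
    # ... reader/infer/eval settings elided ...
    k=[4, 8, 16],  # report G-Pass@4, G-Pass@8, G-Pass@16
    repeat=3,      # total generations n = max(k) * repeat = 48
)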
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .livemathbench_greedy_gen_efb20d import livemathbench_datasets  # noqa: F401, F403
+    from .livemathbench_greedy_gen_9befbf import livemathbench_datasets  # noqa: F401, F403
@@ -41,10 +41,7 @@ livemathbench_dataset = dict(
             url=[],
             use_extract_model=False,
             extract_url=[],
-            extract_model_name='',
-            k=[1],
-            repeat=1,
-            thresholds=[0.0]
+            extract_model_name=''
         )
     )
 )
@@ -1,10 +1,9 @@
 import os
 import warnings
 from collections import OrderedDict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from functools import partial
 from itertools import product
-from typing import Any, Callable, Dict, List, Union
+from typing import Any, Callable, Dict, List
 
 import jsonlines
 import mmengine
@@ -13,7 +12,7 @@ from datasets import Dataset, load_dataset
 
 from opencompass.datasets.math import MATHAgentEvaluator, math_postprocess_v2
 from opencompass.models import OpenAISDK
-from opencompass.openicl.icl_evaluator import GPassKEvaluator
+from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.openicl.icl_inferencer.icl_base_inferencer import \
     dump_results_dict
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, MODELS
@@ -107,7 +106,7 @@ class LiveMathBenchDataset(BaseDataset):
 
 
 @ICL_EVALUATORS.register_module()
-class LiveMathBenchEvaluator(GPassKEvaluator):
+class LiveMathBenchEvaluator(BaseEvaluator):
     api_meta_template = dict(round=[
         dict(role='HUMAN', api_role='HUMAN'),
         dict(role='BOT', api_role='BOT', generate=True),
@@ -118,11 +117,8 @@ class LiveMathBenchEvaluator(GPassKEvaluator):
                 url,
                 use_extract_model=False,
                 extract_url=[],
-                extract_model_name='',
-                k: Union[int, List[int]] = 16,
-                repeat: int = 3,
-                thresholds: List[float] = [0.0, 0.25, 0.5, 0.75, 1.0]):
-        super().__init__(k, repeat, thresholds)
+                extract_model_name=''):
+        super().__init__()
 
         if isinstance(url, str):
             url = [url]
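After this hunk the evaluator constructor keeps only judge and answer-extraction settings; `k`, `repeat`, and `thresholds` arrive later through `BaseEvaluator.evaluate`. A hedged instantiation sketch (the URL is a placeholder, not taken from the diff):

evaluator = LiveMathBenchEvaluator(
    url=['http://127.0.0.1:23333/v1'],  # judge endpoint(s); placeholder value
    use_extract_model=False,
    extract_url=[],
    extract_model_name='')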
@@ -303,55 +299,18 @@
     def preprocess(self, predictions, references, test_set):
         return self.judge(predictions, references, test_set)
 
-    def group(self, predictions, labels, test_set):
-        example2replications = {}
-        for example, label, prediction in zip(test_set, labels, predictions):
-            example_abbr = f"{example['subdivision']}_{example['idx']}"
-            if example_abbr not in example2replications:
-                example2replications[example_abbr] = []
-            example.update({'prediction': prediction, 'label': label})
-            example2replications[example_abbr].append(example)
-        for _, replications in example2replications.items():
-            assert len(replications) == self.n, print(len(replications),
-                                                      self.n)
-        return example2replications
-
-    def reduce(self, details) -> Dict[str, Any]:
-        """Aggregate the overall metrics.
-
-        Return:
-            A dict contains overall metrics, like:
-            {'details': details for each example, 'G-Pass@16': xxx}
-        """
-        g_passk_details = OrderedDict()
-        g_passk_details['details'] = details
-
-        all_dataset = set([detail['subdivision'] for detail in details])
-
-        for k in self.k:
-            for subdivision in sorted(list(all_dataset)):
-                for threshold in self.thresholds:
-                    g_passk_details[
-                        f'{subdivision}/G-Pass@{k}_{threshold}'] = \
-                        100. * np.mean(
-                        [
-                            detail[f'G-Pass@{k}_{threshold}']
-                            for detail in details
-                            if detail['subdivision'] == subdivision
-                        ])
-                g_passk_details[f'{subdivision}/mG-Pass@{k}'] = 100. * np.mean(
-                    [
-                        detail[f'mG-Pass@{k}'] for detail in details
-                        if detail['subdivision'] == subdivision
-                    ])
-
-            for threshold in self.thresholds:
-                g_passk_details[f'G-Pass@{k}_{threshold}'] = 100. * np.mean(
-                    [detail[f'G-Pass@{k}_{threshold}'] for detail in details])
-            g_passk_details[f'mG-Pass@{k}'] = 100. * np.mean(
-                [detail[f'mG-Pass@{k}'] for detail in details])
-
-        return g_passk_details
+    def score(self, predictions, references, test_set) -> Dict[str, Any]:
+        labels = self.preprocess(predictions, references, test_set)
+        results = {'accuracy': 100 * np.mean(labels), 'details': []}
+
+        for pred, ref, label in zip(predictions, references, labels):
+            results['details'].append({
+                'pred': pred,
+                'ref': ref,
+                'correct': label
+            })
+
+        return results
 
 
 class LiveMathBenchOutputHandler:
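The new `score` no longer aggregates G-Pass@k itself: it reduces the judge labels to plain accuracy plus per-sample details, and replication-aware aggregation moves into `BaseEvaluator`. A self-contained toy run of the new contract (made-up predictions; the list comprehension stands in for the judge labels returned by `preprocess`):

import numpy as np

predictions = ['42', '41', '42', '42']
references = ['42'] * 4
labels = [p == r for p, r in zip(predictions, references)]  # stand-in judge

results = {'accuracy': 100 * np.mean(labels), 'details': []}
for pred, ref, label in zip(predictions, references, labels):
    results['details'].append({'pred': pred, 'ref': ref, 'correct': label})
print(results['accuracy'])  # 75.0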
@@ -4,7 +4,6 @@ from .icl_base_evaluator import BaseEvaluator  # noqa
 from .icl_bpc_evaluator import BPCEvaluator  # noqa
 from .icl_circular_evaluator import CircularEvaluator  # noqa
 from .icl_em_evaluator import EMEvaluator  # noqa
-from .icl_gpassk_evaluator import GPassKEvaluator  # noqa
 from .icl_hf_evaluator import *  # noqa
 from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator  # noqa
 from .icl_misc_evaluator import AverageInferencePPLEvaluator  # noqa
@@ -41,6 +41,11 @@ class BaseEvaluator:
     def __init__(self) -> None:
         pass
 
+    @property
+    def output_dir(self):
+        # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200
+        return self._out_dir
+
     def group(self, n: int, details: List[Dict[str, Any]],
               test_set: Dataset) -> Dict[str, Any]:
         example2replications = {}
@@ -77,18 +82,24 @@
         return g_passk_details
 
     def evaluate(self, k: Union[int, List[int]], repeat: int,
-                 test_set: Dataset, **score_kwargs):
+                 original_dataset: Dataset, **score_kwargs):
         n = (max(k) if isinstance(k, List) else k) * repeat
-        print(len(score_kwargs['predictions']))
-        real_size = len(test_set) // n
+        real_size = len(original_dataset) // n
         all_details = []
         all_results = []
+
+        def select_fn(i, real_size, x):
+            if isinstance(x, Dataset):
+                return x.select(range(i * real_size, (i + 1) * real_size))
+            elif isinstance(x, Iterable):
+                return x[i * real_size:(i + 1) * real_size]
+            else:
+                return x
+
         for i in range(n):
             results = self.score(
                 **{
-                    key:
-                    value[i * real_size:(i + 1) *
-                          real_size] if isinstance(value, Iterable) else value
+                    key: select_fn(i, real_size, value)
                     for key, value in score_kwargs.items()
                 })
             details = results.pop('details', None)
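`select_fn` splits every sliceable argument in `score_kwargs` (and any HuggingFace `Dataset`) into `n` contiguous chunks of `real_size`, one full pass over the original dataset per chunk, while non-iterable values pass through unchanged. A tiny demonstration with stand-in numbers:

# n = 4 scoring passes over a dataset of real_size = 3 examples
n, real_size = 4, 3
predictions = list(range(n * real_size))  # 12 predictions, replication-major
chunks = [predictions[i * real_size:(i + 1) * real_size] for i in range(n)]
print(chunks)  # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]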
@@ -118,7 +129,7 @@
         else:
             eval_results[key] = eval_results[key][0]
 
-        grouped_examples = self.group(n, all_details, test_set)
+        grouped_examples = self.group(n, all_details, original_dataset)
         can_calculate = False
         if len(all_details) != 0:
             eval_details = []
@@ -1,163 +0,0 @@
-from abc import abstractmethod
-from typing import Any, Dict, List, Union
-
-import numpy as np
-from scipy.stats import hypergeom
-
-from opencompass.registry import ICL_EVALUATORS
-
-from .icl_base_evaluator import BaseEvaluator
-
-
-def compute_pass_at_k(n, c, k):
-    if n - c < k:
-        return 1.0
-    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
-
-
-def _compute_g_pass_at_k(n, c, k, m):
-    if m > min(c, k) or k > n or c < 0 or n <= 0 or m < 0:
-        return 0.0
-    return hypergeom.sf(m - 1, n, c, k)
-
-
-def compute_g_pass_at_k(n, c, k, t):
-    m = max(int(np.ceil(k * t)), 1)
-    return _compute_g_pass_at_k(n, c, k, m)
-
-
-def compute_mg_pass_at_k(n, c, k):
-    l, r = int(np.ceil(k * 0.5)), k
-
-    mg_pass_at_k = 0.0
-    for i in range(l + 1, r + 1):
-        mg_pass_at_k += _compute_g_pass_at_k(n, c, k, i)
-    mg_pass_at_k = 2 * mg_pass_at_k / k
-
-    return mg_pass_at_k
-
-
-@ICL_EVALUATORS.register_module()
-class GPassKEvaluator(BaseEvaluator):
-    """Evaluator for computing the G-Pass@k Metric.
-
-    This evaluator performs the following steps:
-    1. Invokes task-specific `preprocess` on predictions to
-    assign a consistency label to each prediction and its
-    corresponding reference.
-    2. Calculates metrics for each input example based on
-    these labels.
-    3. Aggregates the overall metrics through a task-specific
-    `postprocess`.
-
-    Args:
-        k (int or list of int): Number of predictions to be
-        considered in G-Pass@k. It can be a single integer
-        (e.g., `k=16` computes G-Pass@16) or a list of
-        integers (e.g., `[4, 8, 16]` computes G-Pass@4,
-        G-Pass@8, and G-Pass@16).
-
-        repeat (int): Controls the number of generations
-        used to estimate G-Pass@k. The total number of
-        generations is determined by multiplying the
-        maximum of `k` with `repeat`. This parameter
-        should be a single integer.
-
-        thresholds (list of float): A list of floating-point
-        numbers that define the thresholds for the G-Pass@k
-        metric.
-    """
-
-    def __init__(
-            self,
-            k: Union[int, List[int]] = 16,
-            repeat: int = 3,
-            thresholds: List[float] = [0.0, 0.25, 0.5, 0.75, 1.0]) -> None:
-        super().__init__()
-
-        if isinstance(k, int):
-            k = [k]
-
-        self.k = k
-        self.repeat = repeat
-        self.n = max(k) * repeat
-        self.thresholds = thresholds
-
-    @property
-    def output_dir(self):
-        # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200
-        return self._out_dir
-
-    @abstractmethod
-    def preprocess(self, predictions, references, test_set) -> None:
-        """Perform operations on predictions before computing metrics, for
-        example, do answer_extraction and model_judge in mathematical reasoning
-        task.
-
-        Return:
-            labels: A list contains the label which indicates whether
-            prediction is consistency with reference at each position.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def group(self, predictions, labels, test_set) -> Dict[str, Any]:
-        """Group the predictions and references.
-
-        Return:
-            A dict contains the grouped predictions and references.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def reduce(self, details) -> Dict[str, Any]:
-        """Aggregate the overall metrics.
-
-        Return:
-            A dict contains overall metrics, like:
-            {'details': details for each example, 'G-Pass@16': xxx}
-        """
-        raise NotImplementedError
-
-    def score(self, predictions, references, test_set) -> Dict[str, Any]:
-        """Compute G-Pass@k metrics.
-
-        Return:
-            A dict contains metrics for each dataset sample and
-            overall metrics reduced by `self.reduce`, like:
-            {'details': details for each example, 'G-Pass@16': xxx}
-        """
-        labels = self.preprocess(predictions, references, test_set)
-        grouped_examples = self.group(predictions, labels, test_set)
-
-        details = []
-        total_pass_num, count = 0, 0
-        for example_abbr, examples in grouped_examples.items():
-            detail = {
-                k: v
-                for k, v in examples[0].items()
-                if k not in ['prediction', 'label']
-            }
-            detail.update({
-                'predictions': [{
-                    'prediction': example['prediction'],
-                    'label': example['label']
-                } for example in examples],
-            })
-
-            current_example_labels = [e['label'] for e in examples]
-            c = int(np.sum(current_example_labels))
-
-            for k in self.k:
-                for threshold in self.thresholds:
-                    detail[f'G-Pass@{k}_{threshold}'] = compute_g_pass_at_k(
-                        n=self.n, c=c, k=k, t=threshold)
-                detail[f'mG-Pass@{k}'] = compute_mg_pass_at_k(n=self.n,
-                                                              c=c,
-                                                              k=k)
-            count += self.n
-            total_pass_num += c
-
-            details.append(detail)
-
-        return self.reduce(details)
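For reference, the deleted helpers treat G-Pass@k at threshold t as a hypergeometric tail probability: with n total generations of which c pass, the chance that at least m = max(ceil(k * t), 1) of k uniformly sampled generations pass is hypergeom.sf(m - 1, n, c, k); mG-Pass@k then averages the tail over thresholds above 0.5. A standalone check using the same math on toy numbers:

import numpy as np
from scipy.stats import hypergeom

n, c, k, t = 48, 30, 16, 0.5         # 30 of 48 generations judged correct
m = max(int(np.ceil(k * t)), 1)      # require >= 8 of 16 sampled draws correct
print(hypergeom.sf(m - 1, n, c, k))  # survival function: P(X >= m)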
@@ -217,7 +217,8 @@ class OpenICLEvalTask(BaseTask):
             }
         k = self.dataset_cfg.get('k', 1)
         repeat = self.dataset_cfg.get('repeat', 1)
-        result = icl_evaluator.evaluate(k, repeat, test_set, **preds)
+        result = icl_evaluator.evaluate(k, repeat, copy.deepcopy(test_set),
+                                        **preds)
 
         # Get model postprocess result
         model_details = None
@@ -225,7 +226,8 @@ class OpenICLEvalTask(BaseTask):
         if 'model_postprocessor' in self.eval_cfg:
             model_preds = copy.deepcopy(preds)
             model_preds['predictions'] = model_pred_strs
-            model_result = icl_evaluator.evaluate(k, repeat, test_set,
+            model_result = icl_evaluator.evaluate(k, repeat,
+                                                  copy.deepcopy(test_set),
                                                   **model_preds)
             for key in model_result:
                 if key == 'details':
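The `copy.deepcopy(test_set)` guards are the "fix potential errors" part of the commit title: grouping mutates the examples in place (each example dict gets `prediction` and `label` written into it, as in the deleted `group` above), so reusing one `test_set` across the main and model-postprocessed evaluations would leak state between the two runs. A minimal sketch of the failure mode the copies prevent:

import copy

example = {'idx': 0, 'subdivision': 'AMC'}
test_set = [example]

def group_once(examples, prediction, label):
    for ex in examples:  # grouping mutates examples in place
        ex.update({'prediction': prediction, 'label': label})

group_once(copy.deepcopy(test_set), '42', True)  # first evaluate() call
group_once(copy.deepcopy(test_set), 'x', False)  # second, postprocessed call
print(example)  # original entries stay pristine across both runs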