import random
from typing import List

import evaluate
import numpy as np

from opencompass.registry import ICL_EVALUATORS

from .icl_base_evaluator import BaseEvaluator


class HuggingfaceEvaluator(BaseEvaluator):
    """Use the huggingface `evaluate` module to calculate the target metrics.

    Args:
        metric (str): Metric name in the evaluate module.
        seed (int): Some metrics involve randomness during calculation, so a
            fixed random seed is set for reproducibility. Defaults to 0.
    """

    def __init__(self, metric: str, seed: int = 0) -> None:
        self.metric = metric
        random.seed(seed)
        np.random.seed(seed)
        super().__init__()

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Preprocess the final predictions and references to the required
        format.

        Args:
            predictions (List): List of predictions for each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: Preprocessed results.
        """
        return {
            'predictions': predictions,
            'references': references,
        }

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess the final scores.

        Args:
            scores (dict): Dict of calculated metric scores.

        Returns:
            dict: Postprocessed scores.
        """
        return scores

    def score(self, predictions: List, references: List) -> dict:
        """Calculate scores.

        Args:
            predictions (List): List of predictions for each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: Calculated scores.
        """
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }
        metric = evaluate.load(self.metric)
        scores = metric.compute(**self._preprocess(predictions, references))
        return self._postprocess(scores)
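
# Usage sketch (illustrative, not part of the upstream module): `score` loads
# the metric via `evaluate.load` (which may download the metric script on
# first use) and pipes the inputs through `_preprocess` / `_postprocess`.
# Subclasses only override those two hooks; with the base class directly:
#
#     >>> evaluator = HuggingfaceEvaluator(metric='exact_match')
#     >>> evaluator.score(predictions=['paris', 'rome'],
#     ...                 references=['paris', 'london'])
#     {'exact_match': 0.5}
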
@ICL_EVALUATORS.register_module()
class AccEvaluator(HuggingfaceEvaluator):
    """Accuracy evaluator."""

    def __init__(self) -> None:
        super().__init__(metric='accuracy')

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Preprocess the final predictions and references to the required
        format.

        Args:
            predictions (List): List of predictions for each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: Preprocessed results.
        """
        # Map every distinct reference label (as a string) to an integer id.
        mapping_to_int_dict = {
            label: idx
            for idx, label in enumerate(set(map(str, references)))
        }
        # Predictions outside the reference label set get fresh ids, so they
        # can never match a gold label and are counted as incorrect.
        pred_set = set(predictions)
        for pred in pred_set:
            if str(pred) not in mapping_to_int_dict.keys():
                mapping_to_int_dict[str(pred)] = len(mapping_to_int_dict)
        golds = [mapping_to_int_dict[str(gold)] for gold in references]
        preds = [mapping_to_int_dict[str(pred)] for pred in predictions]
        return {
            'predictions': preds,
            'references': golds,
        }

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess the final scores.

        Args:
            scores (dict): Dict of calculated metric scores.

        Returns:
            dict: Postprocessed scores.
        """
        # Report accuracy as a percentage.
        scores['accuracy'] *= 100
        return scores
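
# Illustrative example (not in the upstream module): labels are compared as
# strings after being mapped to integer ids, so a prediction outside the
# reference label set simply counts as wrong:
#
#     >>> AccEvaluator().score(predictions=['A', 'C'], references=['A', 'B'])
#     {'accuracy': 50.0}
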
@ICL_EVALUATORS.register_module()
class RougeEvaluator(HuggingfaceEvaluator):
    """Rouge evaluator."""

    def __init__(self) -> None:
        super().__init__(metric='rouge')

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess the final scores.

        Args:
            scores (dict): Dict of calculated metric scores.

        Returns:
            dict: Postprocessed scores.
        """
        # Report every ROUGE variant as a percentage.
        return {k: v * 100 for k, v in scores.items()}
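
# Illustrative example (not in the upstream module): the 'rouge' metric in
# `evaluate` typically reports rouge1/rouge2/rougeL/rougeLsum as fractions in
# [0, 1], which the postprocess step above rescales to [0, 100], e.g.
#
#     >>> RougeEvaluator().score(predictions=['the cat sat on the mat'],
#     ...                        references=['the cat sat on the mat'])
#     {'rouge1': 100.0, 'rouge2': 100.0, 'rougeL': 100.0, 'rougeLsum': 100.0}
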
@ICL_EVALUATORS.register_module()
class BleuEvaluator(HuggingfaceEvaluator):
    """Bleu evaluator."""

    def __init__(self) -> None:
        super().__init__(metric='sacrebleu')
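
# Illustrative note (not in the upstream module): 'sacrebleu' already reports
# its `score` field on a 0-100 scale, so no `_postprocess` override is needed
# here; the raw `metric.compute` dict (score, precisions, bp, ...) is returned
# as-is.
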
@ICL_EVALUATORS.register_module()
class MccEvaluator(AccEvaluator):
    """Matthews correlation evaluator."""

    def __init__(self) -> None:
        # Skip AccEvaluator.__init__ (which hardcodes metric='accuracy') and
        # call HuggingfaceEvaluator.__init__ directly, while still inheriting
        # AccEvaluator's label-to-int preprocessing.
        super(AccEvaluator, self).__init__(metric='matthews_correlation')

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess the final scores.

        Args:
            scores (dict): Dict of calculated metric scores.

        Returns:
            dict: Postprocessed scores.
        """
        # Scale the correlation coefficient to a percentage-like range.
        scores['matthews_correlation'] *= 100
        return scores
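
# Illustrative note (not in the upstream module): the Matthews correlation
# coefficient lies in [-1, 1], so the reported value after rescaling lies in
# [-100, 100], with 100 meaning perfect agreement and 0 chance-level
# prediction.
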
@ICL_EVALUATORS.register_module()
class SquadEvaluator(HuggingfaceEvaluator):
    """Squad evaluator."""

    def __init__(self) -> None:
        super().__init__(metric='squad')

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Preprocess the final predictions and references to the required
        format.

        Args:
            predictions (List): List of predictions for each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: Preprocessed results.
        """
        # Keep only the first line of each prediction and wrap predictions and
        # references into the id-keyed structures expected by the squad metric.
        p_list = [{
            'prediction_text': pred.split('\n')[0],
            'id': str(i)
        } for i, pred in enumerate(predictions)]
        r_list = [{
            'answers': {
                'answer_start': [0],
                'text': [ref]
            },
            'id': str(i)
        } for i, ref in enumerate(references)]
        return {
            'predictions': p_list,
            'references': r_list,
        }

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess the final scores.

        Args:
            scores (dict): Dict of calculated metric scores.

        Returns:
            dict: Postprocessed scores.
        """
        # Keep only the F1 score; note this returns a float, not a dict.
        return scores['f1']
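
# Illustrative example (not in the upstream module): `_preprocess` turns plain
# string pairs into the squad metric's expected format, e.g. for one sample
#
#     predictions -> [{'prediction_text': 'Denver Broncos', 'id': '0'}]
#     references  -> [{'answers': {'answer_start': [0],
#                                  'text': ['Denver Broncos']}, 'id': '0'}]
#
# The squad metric then reports exact_match and f1 on a 0-100 scale, and only
# f1 is returned by `_postprocess`.
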
@ICL_EVALUATORS.register_module()
class EDAccEvaluator(AccEvaluator):
    """Edit distance based accuracy evaluator.

    This implementation requires the un-postprocessed outputs from the model,
    and a reference list where each item is structured as:

    .. code-block:: python

        {
            'candidates': [],  # a list of informative answer candidates
            'label': 0,  # the index of the gold answer
        }

    It always matches the model's output to the valid answer with the minimum
    edit distance.
    """

    def __init__(self) -> None:
        super().__init__()
        from rapidfuzz.distance import Levenshtein
        self.dist = Levenshtein.distance

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Preprocess the final predictions and references to the required
        format.

        Args:
            predictions (List): List of predictions for each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: Preprocessed results.
        """
        preds = []
        golds = []

        for i in range(len(predictions)):
            pred, ref = predictions[i], references[i]
            # Snap the raw model output to the candidate with the smallest
            # Levenshtein distance, then compare its index to the gold label.
            dists = [self.dist(pred, cand) for cand in ref['candidates']]
            preds.append(np.argmin(dists))
            golds.append(ref['label'])

        return {
            'predictions': preds,
            'references': golds,
        }
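
# Illustrative example (not in the upstream module): a free-form generation is
# snapped to the closest candidate before accuracy is computed, e.g.
#
#     >>> refs = [{'candidates': ['cat', 'dog', 'bird'], 'label': 1}]
#     >>> EDAccEvaluator().score(predictions=['a dog.'], references=refs)
#     {'accuracy': 100.0}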