import os
import random
from typing import List, Optional

import evaluate
import numpy as np
from datasets import Dataset
from mmengine.config import ConfigDict

from opencompass.registry import ICL_EVALUATORS

from .icl_base_evaluator import BaseEvaluator


class HuggingfaceEvaluator(BaseEvaluator):
    """Use the huggingface `evaluate` module to calculate the target metrics.

    Args:
        metric (str): Metric name in the `evaluate` module.
        seed (int): Some metrics involve randomness during calculation, so a
            fixed random seed is set for reproducibility. Defaults to 0.
        pred_postprocessor (optional): Function or configuration for
            prediction post-processing.
    """

    def __init__(self,
                 metric: str,
                 seed: int = 0,
                 pred_postprocessor=None) -> None:
        self.metric = metric
        self.seed = seed
        super().__init__(pred_postprocessor=pred_postprocessor)

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Preprocess the final predictions and references to the needed
        format.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: preprocessed results.
        """
        return {
            'predictions': self.pred_postprocess(predictions),
            'references': references,
        }

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess for final scores.

        Args:
            scores (dict): Dict of calculated scores of metrics.

        Returns:
            dict: postprocessed scores.
        """
        return scores

    def score(self,
              predictions: List,
              references: List,
              test_set=None) -> dict:
        """Calculate scores.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: calculated scores.
        """
        random_state = random.getstate()
        np_random_state = np.random.get_state()

        random.seed(self.seed)
        np.random.seed(self.seed)
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }
        # Use metric scripts pre-downloaded into the opencompass repo to
        # avoid downloading them at runtime.
        local_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                  'hf_metrics', self.metric + '.py')
        if os.path.exists(local_path):
            metric = evaluate.load(local_path)
        else:
            metric = evaluate.load(self.metric)
        scores = metric.compute(**self._preprocess(predictions, references))
        result = self._postprocess(scores)
        random.setstate(random_state)
        np.random.set_state(np_random_state)
        return result
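

# Usage sketch (illustrative only; `preds` and `refs` are hypothetical lists
# of model outputs and gold answers): subclasses below are typically
# instantiated without arguments and scored on parallel lists.
#
#     evaluator = AccEvaluator()
#     result = evaluator.score(predictions=preds, references=refs)
#     # e.g. {'accuracy': 87.5}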


@ICL_EVALUATORS.register_module()
class AccEvaluator(HuggingfaceEvaluator):
    """Accuracy evaluator."""

    def __init__(self,
                 pred_postprocessor: Optional[ConfigDict] = None) -> None:
        super().__init__(metric='accuracy',
                         pred_postprocessor=pred_postprocessor)

    def _preprocess(self,
                    predictions: List,
                    references: List,
                    test_set=None) -> dict:
        """Preprocess the final predictions and references to the needed
        format.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: preprocessed results.
        """
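        # Build a shared str(label) -> int mapping over references first and
        # then over any unseen predictions, so both sides can be compared as
        # integer class ids by the accuracy metric.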
        mapping_to_int_dict = {
            label: idx
            for idx, label in enumerate(set(map(str, references)))
        }
        pred_set = set(predictions)
        for pred in pred_set:
            if str(pred) not in mapping_to_int_dict.keys():
                mapping_to_int_dict[str(pred)] = len(mapping_to_int_dict)
        golds = [mapping_to_int_dict[str(gold)] for gold in references]
        preds = [mapping_to_int_dict[str(pred)] for pred in predictions]
        return {
            'predictions': preds,
            'references': golds,
        }

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess for final scores.

        Args:
            scores (dict): Dict of calculated scores of metrics.

        Returns:
            dict: postprocessed scores.
        """
        scores['accuracy'] *= 100
        return scores


@ICL_EVALUATORS.register_module()
class AccContaminationEvaluator(AccEvaluator):
    """Accuracy evaluator that reports separate scores for clean and
    contaminated samples."""

    def score(self, predictions: List, references: List,
              test_set: Dataset) -> dict:
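        # `test_set` is expected to provide an 'is_clean' column with values
        # 'clean', 'input contamination' or 'input-and-label contamination';
        # samples with any other value are ignored.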
        # group the predictions and references by their contamination status
        clean_predictions, clean_references = [], []
        input_contaminated_predictions, input_contaminated_references = [], []
        input_and_label_contaminated_predictions, \
            input_and_label_contaminated_references = [], []
        for pred, ref, is_clean in zip(predictions, references,
                                       test_set['is_clean']):
            if is_clean == 'clean':
                clean_predictions.append(pred)
                clean_references.append(ref)
            elif is_clean == 'input contamination':
                input_contaminated_predictions.append(pred)
                input_contaminated_references.append(ref)
            elif is_clean == 'input-and-label contamination':
                input_and_label_contaminated_predictions.append(pred)
                input_and_label_contaminated_references.append(ref)
        clean_results = super().score(clean_predictions, clean_references)
        input_contaminated_results = super().score(
            input_contaminated_predictions, input_contaminated_references)
        input_and_label_contaminated_results = super().score(
            input_and_label_contaminated_predictions,
            input_and_label_contaminated_references)

        # rename the keys of the results, adding 'clean', 'input
        # contaminated' and 'input-and-label contaminated' as suffixes
        clean_results = {f'{k} - clean': v for k, v in clean_results.items()}
        input_contaminated_results = {
            f'{k} - input contaminated': v
            for k, v in input_contaminated_results.items()
        }
        input_and_label_contaminated_results = {
            f'{k} - input-and-label contaminated': v
            for k, v in input_and_label_contaminated_results.items()
        }
        return {
            **clean_results,
            **input_contaminated_results,
            **input_and_label_contaminated_results
        }


@ICL_EVALUATORS.register_module()
class RougeEvaluator(HuggingfaceEvaluator):
    """Rouge evaluator.

    Note: this evaluator is not suitable for Chinese datasets.
    """

    def __init__(self,
                 pred_postprocessor: Optional[ConfigDict] = None) -> None:
        super().__init__(metric='rouge', pred_postprocessor=pred_postprocessor)

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess for final scores.

        Args:
            scores (dict): Dict of calculated scores of metrics.

        Returns:
            dict: postprocessed scores.
        """
        return {k: v * 100 for k, v in scores.items()}


@ICL_EVALUATORS.register_module()
class BleuEvaluator(HuggingfaceEvaluator):
    """Bleu evaluator."""

    def __init__(self,
                 pred_postprocessor: Optional[ConfigDict] = None) -> None:
        super().__init__(metric='sacrebleu',
                         pred_postprocessor=pred_postprocessor)


class BleuFloresEvaluator(HuggingfaceEvaluator):
    """Bleu evaluator using the flores200 tokenizer."""

    def __init__(self) -> None:
        super().__init__(metric='sacrebleu')

    def _preprocess(self, predictions: List, references: List) -> dict:
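        # Besides predictions and references, the extra 'tokenize' key is
        # forwarded to the sacrebleu metric's compute() via
        # `metric.compute(**self._preprocess(...))` in
        # HuggingfaceEvaluator.score().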
        return {
            'predictions': predictions,
            'references': references,
            'tokenize': 'flores200',
        }


@ICL_EVALUATORS.register_module()
class MccEvaluator(AccEvaluator):
    """Matthews correlation evaluator."""

    def __init__(self) -> None:
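        # Skip AccEvaluator.__init__ (which hard-codes metric='accuracy') and
        # initialize HuggingfaceEvaluator directly with the target metric.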
        super(AccEvaluator, self).__init__(metric='matthews_correlation')

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess for final scores.

        Args:
            scores (dict): Dict of calculated scores of metrics.

        Returns:
            dict: postprocessed scores.
        """
        scores['matthews_correlation'] *= 100
        return scores


@ICL_EVALUATORS.register_module()
class SquadEvaluator(HuggingfaceEvaluator):
    """Squad evaluator."""

    def __init__(self) -> None:
        super().__init__(metric='squad')

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Preprocess the final predictions and references to the needed
        format.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: preprocessed results.
        """
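        # Keep only the first line of each prediction as the answer text; the
        # 'answer_start' of 0 below is a placeholder required by the squad
        # metric's input format.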
        p_list = [{
            'prediction_text': pred.split('\n')[0],
            'id': str(i)
        } for i, pred in enumerate(predictions)]
        r_list = [{
            'answers': {
                'answer_start': [0],
                'text': [ref]
            },
            'id': str(i)
        } for i, ref in enumerate(references)]
        return {
            'predictions': p_list,
            'references': r_list,
        }

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess for final scores.

        Args:
            scores (dict): Dict of calculated scores of metrics.

        Returns:
            float: the extracted F1 score.
        """
        return scores['f1']


@ICL_EVALUATORS.register_module()
class EDAccEvaluator(AccEvaluator):
    """Edit-distance-based accuracy evaluator.

    This implementation requires the un-postprocessed outputs from the model
    and a reference list where each item is structured as:

    .. code-block:: python

        {
            'candidates': [],  # a list of informative answer candidates
            'label': 0,  # the index of the gold answer
        }

    It matches the model's output to a valid answer using minimum edit
    distance as the criterion.
    """

    def __init__(self) -> None:
        super().__init__()
        from rapidfuzz.distance import Levenshtein
        self.dist = Levenshtein.distance

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Preprocess the final predictions and references to the needed
        format.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: preprocessed results.
        """

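        # For each sample, match the raw prediction to the candidate with the
        # smallest edit distance; the index of that candidate becomes the
        # predicted label, which is then compared against ref['label'].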
        preds = []
        golds = []

        for i in range(len(predictions)):
            pred, ref = predictions[i], references[i]
            dists = []
            for cands in ref['candidates']:
                if isinstance(cands, str):
                    d = self.dist(pred, cands)
                else:
                    d = np.min([self.dist(pred, cand) for cand in cands])
                dists.append(d)
            preds.append(np.argmin(dists))
            golds.append(ref['label'])

        return {
            'predictions': preds,
            'references': golds,
        }


@ICL_EVALUATORS.register_module()
class AccwithDetailsEvaluator(BaseEvaluator):
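    """Accuracy evaluator that also returns per-sample prediction details."""
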
    def score(self, predictions, references, origin_prompt) -> dict:

        if len(predictions) != len(references):
            return {'error': 'preds and refs have different length.'}

        details = {}
        correct, total = 0, 0
        for index, (pred, ref) in enumerate(zip(predictions, references)):
            is_correct = pred == ref
            correct += is_correct
            details[str(index)] = {
                'prompt': origin_prompt[index],
                'pred': pred,
                'refr': ref,
                'is_correct': is_correct,
            }
            total += 1

        results = {'accuracy': correct / total * 100, 'details': details}

        return results
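
# Registry usage sketch (illustrative only; `preds` and `refs` are
# hypothetical lists): since the classes above are registered in
# ICL_EVALUATORS, an evaluator can also be built from a config dict.
#
#     evaluator = ICL_EVALUATORS.build(dict(type='RougeEvaluator'))
#     scores = evaluator.score(predictions=preds, references=refs)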