# OpenCompass/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py


import os
import random
from typing import List, Optional

import evaluate
import numpy as np
from datasets import Dataset
from mmengine.config import ConfigDict

from opencompass.registry import ICL_EVALUATORS

from .icl_base_evaluator import BaseEvaluator


class HuggingfaceEvaluator(BaseEvaluator):
"""Use huggingface evaluate module to calculate the target metrics.
Args:
metric (str): Metric name in evaluate module.
2023-07-28 11:29:01 +08:00
seed (int): There exists some randomness during the calculation of some
metrics, thus we set a fixed random seed for reproducing. Defaults
to 0.
2025-05-06 21:50:39 +08:00
pred_postprocessor (optional): Function or configuration for prediction
post-processing.
2023-07-04 21:34:55 +08:00
"""
2025-04-25 16:35:28 +08:00
def __init__(self,
metric: str,
seed: int = 0,
pred_postprocessor=None) -> None:
self.metric = metric
self.seed = seed
super().__init__(pred_postprocessor=pred_postprocessor)
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
return {
'predictions': self.pred_postprocess(predictions),
'references': references,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return scores
def score(self,
predictions: List,
references: List,
test_set=None) -> dict:
        """Calculate scores.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.
            test_set (Dataset, optional): The original test set; unused here
                and accepted for interface compatibility.

        Returns:
            dict: calculated scores.
        """
random_state = random.getstate()
np_random_state = np.random.get_state()
random.seed(self.seed)
np.random.seed(self.seed)
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
        # Prefer the metric script pre-downloaded into the opencompass repo
        # to avoid downloading it at evaluation time.
local_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'hf_metrics', self.metric + '.py')
if os.path.exists(local_path):
metric = evaluate.load(local_path)
else:
metric = evaluate.load(self.metric)
scores = metric.compute(**self._preprocess(predictions, references))
result = self._postprocess(scores)
random.setstate(random_state)
np.random.set_state(np_random_state)
return result
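
# --- Hedged usage sketch (illustration only, not part of the module) ---
# ``score`` first looks for a metric script bundled under
# ``hf_metrics/<metric>.py`` next to this file and only falls back to
# downloading via ``evaluate.load``. Assuming the ``sacrebleu`` metric is
# resolvable and ``BaseEvaluator``'s default ``pred_postprocess`` is a
# no-op, a direct call might look like:
#
#     evaluator = HuggingfaceEvaluator(metric='sacrebleu')
#     evaluator.score(predictions=['a cat sits on the mat'],
#                     references=[['a cat sits on the mat']])
#
# The result is whatever ``metric.compute`` returns, passed through
# ``_postprocess``.
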
@ICL_EVALUATORS.register_module()
class AccEvaluator(HuggingfaceEvaluator):
"""Accuracy evaluator."""
def __init__(self,
pred_postprocessor: Optional[ConfigDict] = None) -> None:
super().__init__(metric='accuracy',
pred_postprocessor=pred_postprocessor)
def _preprocess(self,
predictions: List,
references: List,
test_set=None) -> dict:
        """Preprocess predictions and references into the required format.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.
            test_set (Dataset, optional): The original test set; unused here
                and accepted for interface compatibility.

        Returns:
            dict: preprocessed results.
        """
mapping_to_int_dict = {
label: idx
for idx, label in enumerate(set(map(str, references)))
}
pred_set = set(predictions)
for pred in pred_set:
if str(pred) not in mapping_to_int_dict.keys():
mapping_to_int_dict[str(pred)] = len(mapping_to_int_dict)
golds = [mapping_to_int_dict[str(gold)] for gold in references]
preds = [mapping_to_int_dict[str(pred)] for pred in predictions]
return {
'predictions': preds,
'references': golds,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
scores['accuracy'] *= 100
return scores
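
# --- Hedged illustration (not part of the original module) ---
# ``AccEvaluator._preprocess`` maps arbitrary (string) labels to integer ids
# so that the HF ``accuracy`` metric can compare them. For example, with
# references ['yes', 'no', 'yes'] and predictions ['yes', 'yes', 'maybe']:
#
#     mapping_to_int_dict -> {'yes': 0, 'no': 1, 'maybe': 2}  # order may vary
#     golds -> [0, 1, 0]
#     preds -> [0, 0, 2]
#     score -> {'accuracy': 33.33...}  # 1 of 3 correct, rescaled to 0-100
#
# Predictions that never appear among the references are simply assigned new
# ids, so they count as incorrect rather than raising an error.
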
@ICL_EVALUATORS.register_module()
class AccContaminationEvaluator(AccEvaluator):
"""Accuracy evaluator."""
def score(self, predictions: List, references: List,
test_set: Dataset) -> dict:
# group the predictions and references by their contamination status
clean_predictions, clean_references = [], []
input_contaminated_predictions, input_contaminated_references = [], []
input_and_label_contaminated_predictions, \
input_and_label_contaminated_references = [], []
for pred, ref, is_clean in zip(predictions, references,
test_set['is_clean']):
if is_clean == 'clean':
clean_predictions.append(pred)
clean_references.append(ref)
elif is_clean == 'input contamination':
input_contaminated_predictions.append(pred)
input_contaminated_references.append(ref)
elif is_clean == 'input-and-label contamination':
input_and_label_contaminated_predictions.append(pred)
input_and_label_contaminated_references.append(ref)
clean_results = super().score(clean_predictions, clean_references)
input_contaminated_results = super().score(
input_contaminated_predictions, input_contaminated_references)
input_and_label_contaminated_results = super().score(
input_and_label_contaminated_predictions,
input_and_label_contaminated_references)
        # rename the keys of the results, add 'clean', 'input contaminated',
# 'input-and-label contaminated' as prefixes
clean_results = {f'{k} - clean': v for k, v in clean_results.items()}
input_contaminated_results = {
f'{k} - input contaminated': v
for k, v in input_contaminated_results.items()
}
input_and_label_contaminated_results = {
f'{k} - input-and-label contaminated': v
for k, v in input_and_label_contaminated_results.items()
}
return {
**clean_results,
**input_contaminated_results,
**input_and_label_contaminated_results
}
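
# --- Hedged illustration (not part of the original module) ---
# ``AccContaminationEvaluator`` expects ``test_set`` to provide an
# ``is_clean`` column whose values are 'clean', 'input contamination', or
# 'input-and-label contamination'; samples with any other value are silently
# dropped. The returned dict contains keys such as:
#
#     'accuracy - clean'
#     'accuracy - input contaminated'
#     'accuracy - input-and-label contaminated'
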
@ICL_EVALUATORS.register_module()
class RougeEvaluator(HuggingfaceEvaluator):
"""Rouge evaluator.
Note: this evaluator is not suitable for chinese datasets.
"""
def __init__(self,
pred_postprocessor: Optional[ConfigDict] = None) -> None:
super().__init__(metric='rouge', pred_postprocessor=pred_postprocessor)
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return {k: v * 100 for k, v in scores.items()}
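
# --- Hedged illustration (not part of the original module) ---
# The HF ``rouge`` metric typically returns aggregated fractions such as
# {'rouge1': 0.5, 'rouge2': 0.25, 'rougeL': 0.5, 'rougeLsum': 0.5};
# ``RougeEvaluator._postprocess`` rescales every value to a 0-100 range,
# e.g. {'rouge1': 50.0, 'rouge2': 25.0, 'rougeL': 50.0, 'rougeLsum': 50.0}.
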
@ICL_EVALUATORS.register_module()
class BleuEvaluator(HuggingfaceEvaluator):
"""Bleu evaluator."""
def __init__(self,
pred_postprocessor: Optional[ConfigDict] = None) -> None:
super().__init__(metric='sacrebleu',
pred_postprocessor=pred_postprocessor)
class BleuFloresEvaluator(HuggingfaceEvaluator):
"""Bleu evaluator using flores200 tokenize."""
def __init__(self) -> None:
super().__init__(metric='sacrebleu')
def _preprocess(self, predictions: List, references: List) -> dict:
return {
'predictions': predictions,
'references': references,
'tokenize': 'flores200',
}
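
# --- Hedged illustration (not part of the original module) ---
# ``BleuFloresEvaluator._preprocess`` injects ``tokenize='flores200'``, which
# ``score`` forwards to ``metric.compute``, so the call is roughly:
#
#     metric = evaluate.load('sacrebleu')
#     metric.compute(predictions=preds, references=refs,
#                    tokenize='flores200')
#
# i.e. sacrebleu's FLORES-200 SentencePiece tokenizer, intended for
# multilingual translation outputs.
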
@ICL_EVALUATORS.register_module()
class MccEvaluator(AccEvaluator):
"""Matthews correlation evaluator."""
def __init__(self) -> None:
super(AccEvaluator, self).__init__(metric='matthews_correlation')
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
scores['matthews_correlation'] *= 100
return scores
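
# --- Hedged note (not part of the original module) ---
# ``super(AccEvaluator, self).__init__(...)`` deliberately skips
# ``AccEvaluator.__init__`` (which hard-codes ``metric='accuracy'``) and
# calls ``HuggingfaceEvaluator.__init__`` directly, while still inheriting
# ``AccEvaluator._preprocess`` for the label-to-integer mapping.
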
@ICL_EVALUATORS.register_module()
class SquadEvaluator(HuggingfaceEvaluator):
"""Squad evaluator."""
def __init__(self) -> None:
super().__init__(metric='squad')
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
p_list = [{
'prediction_text': pred.split('\n')[0],
'id': str(i)
} for i, pred in enumerate(predictions)]
r_list = [{
'answers': {
'answer_start': [0],
'text': [ref]
},
'id': str(i)
} for i, ref in enumerate(references)]
return {
'predictions': p_list,
'references': r_list,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return scores['f1']
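
# --- Hedged illustration (not part of the original module) ---
# ``SquadEvaluator`` keeps only the first line of each prediction and
# rewrites inputs into the format expected by the HF ``squad`` metric,
# roughly:
#
#     predictions -> [{'prediction_text': 'Denver Broncos', 'id': '0'}, ...]
#     references  -> [{'answers': {'answer_start': [0],
#                                  'text': ['Denver Broncos']},
#                      'id': '0'}, ...]
#
# ``metric.compute`` returns {'exact_match': ..., 'f1': ...}, and
# ``_postprocess`` keeps only the F1 value.
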
@ICL_EVALUATORS.register_module()
class EDAccEvaluator(AccEvaluator):
"""Edit distance based accuracy evaluator.
This implementation requires the un-postprocessed outputs from the model,
and the reference list where each item is structured as:
.. code-block:: python
{
'candidates': [], # a list of informative answer candidates
'label': 0, # the index of the gold answer
}
It always matches the model's output to a valid answer with the citerion
as the minimum editing distance.
"""
def __init__(self) -> None:
super().__init__()
from rapidfuzz.distance import Levenshtein
self.dist = Levenshtein.distance
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
preds = []
golds = []
for i in range(len(predictions)):
pred, ref = predictions[i], references[i]
dists = []
for cands in ref['candidates']:
if isinstance(cands, str):
d = self.dist(pred, cands)
else:
d = np.min([self.dist(pred, cand) for cand in cands])
dists.append(d)
preds.append(np.argmin(dists))
golds.append(ref['label'])
return {
'predictions': preds,
'references': golds,
}
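
# --- Hedged illustration (not part of the original module) ---
# ``EDAccEvaluator`` maps a free-form prediction to the candidate with the
# smallest Levenshtein distance; when a candidate is itself a list of
# aliases, the minimum distance over its aliases is used. For example:
#
#     pred = 'Paris.'
#     ref = {'candidates': ['London', 'Paris', 'Berlin'], 'label': 1}
#     # 'Paris' is the closest candidate, so the sample is mapped to index 1,
#     # which matches the gold label and counts as correct.
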
@ICL_EVALUATORS.register_module()
class AccwithDetailsEvaluator(BaseEvaluator):
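    """Accuracy evaluator that also records per-sample details."""
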
def score(self, predictions, references, origin_prompt) -> dict:
if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different lengths.'
            }
details = {}
correct, total = 0, 0
for index, (pred, ref) in enumerate(zip(predictions, references)):
is_correct = pred == ref
correct += is_correct
details[str(index)] = {
'prompt': origin_prompt[index],
'pred': pred,
'refr': ref,
'is_correct': is_correct,
}
total += 1
results = {'accuracy': correct / total * 100, 'details': details}
return results
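
# --- Hedged illustration (not part of the original module) ---
# ``AccwithDetailsEvaluator`` returns the aggregate accuracy together with a
# per-sample breakdown, e.g.:
#
#     {
#         'accuracy': 50.0,
#         'details': {
#             '0': {'prompt': '...', 'pred': 'A', 'refr': 'A',
#                   'is_correct': True},
#             '1': {'prompt': '...', 'pred': 'B', 'refr': 'C',
#                   'is_correct': False},
#         },
#     }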