from typing import List

import evaluate

from opencompass.registry import ICL_EVALUATORS

from .icl_base_evaluator import BaseEvaluator

class HuggingfaceEvaluator(BaseEvaluator):
    """Use huggingface evaluate module to calculate the target metrics.

    Args:
        metric (str): Metric name in evaluate module.
    """

    def __init__(self, metric: str) -> None:
        self.metric = metric
        super().__init__()

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Preprocess the final predictions and references to needed format.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: preprocessed results.
        """
        return {
            'predictions': predictions,
            'references': references,
        }

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess for final scores.

        Args:
            scores (dict): Dict of calculated scores of metrics.

        Returns:
            dict: postprocessed scores.
        """
        return scores

    def score(self, predictions: List, references: List) -> dict:
        """Calculate scores.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: calculated scores.
        """
        if len(predictions) != len(references):
            return {'error': 'predictions and references have different '
                    f'length. len(predictions): {len(predictions)}, '
                    f'len(references): {len(references)}'}
        metric = evaluate.load(self.metric)
        scores = metric.compute(**self._preprocess(predictions, references))
        return self._postprocess(scores)
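
# Minimal usage sketch (illustration only, not part of the library API): any
# metric name understood by ``evaluate.load`` can be passed in, and the raw
# dict computed by the metric is returned unless a subclass overrides
# ``_postprocess``. Assuming the ``accuracy`` metric is available locally:
#
#     evaluator = HuggingfaceEvaluator(metric='accuracy')
#     evaluator.score(predictions=[0, 1, 1], references=[0, 1, 0])
#     # -> {'accuracy': 0.666...}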

@ICL_EVALUATORS.register_module()
class AccEvaluator(HuggingfaceEvaluator):
    """Accuracy evaluator."""

    def __init__(self) -> None:
        super().__init__(metric='accuracy')

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Preprocess the final predictions and references to needed format.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: preprocessed results.
        """
        # Map every distinct reference label (as a string) to an integer id,
        # since the huggingface accuracy metric expects integer class ids.
        mapping_to_int_dict = {
            label: idx
            for idx, label in enumerate(set(map(str, references)))
        }
        # Predictions that never occur among the references still need an id
        # of their own; they can only ever be counted as incorrect.
        pred_set = set(predictions)
        for pred in pred_set:
            if str(pred) not in mapping_to_int_dict:
                mapping_to_int_dict[str(pred)] = len(mapping_to_int_dict)
        golds = [mapping_to_int_dict[str(gold)] for gold in references]
        preds = [mapping_to_int_dict[str(pred)] for pred in predictions]
        return {
            'predictions': preds,
            'references': golds,
        }

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess for final scores.

        Args:
            scores (dict): Dict of calculated scores of metrics.

        Returns:
            dict: postprocessed scores.
        """
        # Report accuracy as a percentage rather than a fraction.
        scores['accuracy'] *= 100
        return scores
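
# Rough illustration (hypothetical values, not a test): string labels are
# mapped to integer ids before being handed to the accuracy metric, so
# free-form predictions that never appear in the references simply count as
# wrong:
#
#     AccEvaluator().score(predictions=['A', 'B', 'C'],
#                          references=['A', 'B', 'B'])
#     # -> {'accuracy': 66.66...}  (two of three match, as a percentage)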

@ICL_EVALUATORS.register_module()
class RougeEvaluator(HuggingfaceEvaluator):
    """Rouge evaluator."""

    def __init__(self) -> None:
        super().__init__(metric='rouge')

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess for final scores.

        Args:
            scores (dict): Dict of calculated scores of metrics.

        Returns:
            dict: postprocessed scores.
        """
        return {k: v * 100 for k, v in scores.items()}
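
# Note (version-dependent): with current releases of ``evaluate`` the rouge
# metric is expected to return plain floats keyed by 'rouge1', 'rouge2',
# 'rougeL' and 'rougeLsum'; the postprocessing above simply rescales each
# value to a 0-100 range, e.g. (illustrative, exact values depend on the
# installed rouge implementation):
#
#     RougeEvaluator().score(predictions=['the cat sat'],
#                            references=['the cat sat on the mat'])
#     # -> {'rouge1': ..., 'rouge2': ..., 'rougeL': ..., 'rougeLsum': ...}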

@ICL_EVALUATORS.register_module()
class BleuEvaluator(HuggingfaceEvaluator):
    """Bleu evaluator."""

    def __init__(self) -> None:
        super().__init__(metric='sacrebleu')

@ICL_EVALUATORS.register_module()
class MccEvaluator(AccEvaluator):
    """Matthews correlation evaluator."""

    def __init__(self) -> None:
        # Bypass AccEvaluator.__init__ (which hard-codes metric='accuracy')
        # and call HuggingfaceEvaluator.__init__ directly, while still
        # reusing AccEvaluator's label-to-int preprocessing.
        super(AccEvaluator, self).__init__(metric='matthews_correlation')

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess for final scores.

        Args:
            scores (dict): Dict of calculated scores of metrics.

        Returns:
            dict: postprocessed scores.
        """
        # Report the correlation on a -100 to 100 scale.
        scores['matthews_correlation'] *= 100
        return scores

@ICL_EVALUATORS.register_module()
class SquadEvaluator(HuggingfaceEvaluator):
    """Squad evaluator."""

    def __init__(self) -> None:
        super().__init__(metric='squad')

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Preprocess the final predictions and references to needed format.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: preprocessed results.
        """
        # Keep only the first line of each prediction and wrap predictions
        # and references in the id-keyed format the squad metric expects.
        p_list = [{
            'prediction_text': pred.split('\n')[0],
            'id': str(i)
        } for i, pred in enumerate(predictions)]
        r_list = [{
            'answers': {
                'answer_start': [0],
                'text': [ref]
            },
            'id': str(i)
        } for i, ref in enumerate(references)]
        return {
            'predictions': p_list,
            'references': r_list,
        }

    def _postprocess(self, scores: dict) -> dict:
        """Postprocess for final scores.

        Args:
            scores (dict): Dict of calculated scores of metrics.

        Returns:
            float: the F1 score reported by the squad metric.
        """
        # Only the F1 component of the metric output is kept.
        return scores['f1']
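
# Shape of the data handed to the squad metric (illustrative sketch): each
# prediction keeps only its first line and every reference is wrapped as a
# single-answer SQuAD record, so a call like
#
#     SquadEvaluator().score(predictions=['Paris\nextra text'],
#                            references=['Paris'])
#
# is expected to yield the metric's F1 value (here 100.0), while the exact
# match component computed by the metric is discarded by ``_postprocess``.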