from typing import List

import numpy as np
from sklearn.metrics import roc_auc_score

from opencompass.registry import ICL_EVALUATORS

from .icl_base_evaluator import BaseEvaluator


@ICL_EVALUATORS.register_module()
class AUCROCEvaluator(BaseEvaluator):
    """Calculate AUC-ROC scores and accuracy according to the predictions.

    For some datasets, accuracy alone cannot reveal the difference between
    models because of saturation. AUC-ROC scores can further examine a
    model's ability to distinguish between labels. For more details, refer to
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
    """  # noqa

    def __init__(self) -> None:
        super().__init__()

    def score(self, predictions: List, references: List) -> dict:
        """Calculate AUC-ROC score and accuracy.

        Args:
            predictions (List): List of probabilities for each class of each
                sample.
            references (List): List of target labels for each sample.

        Returns:
            dict: Calculated scores.
        """
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different length.'
            }
        # AUC-ROC is computed from the predicted probability of the positive
        # class (column 1), which assumes binary classification.
        auc_score = roc_auc_score(references, np.array(predictions)[:, 1])
        # Accuracy is the fraction of samples whose highest-probability class
        # matches the reference label.
        accuracy = sum(
            references == np.argmax(predictions, axis=1)) / len(references)
        return dict(auc_score=auc_score * 100, accuracy=accuracy * 100)
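

# Minimal usage sketch (added for illustration; not part of the upstream
# file). It assumes binary classification: each prediction row holds the
# class probabilities [p_class0, p_class1] and each reference is a 0/1
# label. Because of the relative import above, execute this file as a
# module within its package (``python -m ...``) rather than directly.
if __name__ == '__main__':
    evaluator = AUCROCEvaluator()
    toy_predictions = [[0.9, 0.1], [0.2, 0.8], [0.4, 0.6], [0.7, 0.3]]
    toy_references = [0, 1, 1, 0]
    # Every positive sample is ranked above every negative one and every
    # argmax matches its label, so both scores come out as 100.0 here.
    print(evaluator.score(toy_predictions, toy_references))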