OpenCompass/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py


import os
import random
from typing import List
import evaluate
import numpy as np
from opencompass.registry import ICL_EVALUATORS
from .icl_base_evaluator import BaseEvaluator
class HuggingfaceEvaluator(BaseEvaluator):
"""Use huggingface evaluate module to calculate the target metrics.
Args:
metric (str): Metric name in evaluate module.
        seed (int): Some metrics involve randomness during their calculation,
            so a fixed random seed is set for reproducibility. Defaults to 0.
"""
def __init__(self, metric: str, seed: int = 0) -> None:
self.metric = metric
self.seed = seed
super().__init__()
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
return {
'predictions': predictions,
'references': references,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return scores
def score(self, predictions: List, references: List) -> dict:
"""Calculate scores.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: calculated scores.
"""
random_state = random.getstate()
np_random_state = np.random.get_state()
random.seed(self.seed)
np.random.seed(self.seed)
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
# use codes pre-downloaded to opencompass repo, avoid downloading
local_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'hf_metrics', self.metric + '.py')
if os.path.exists(local_path):
metric = evaluate.load(local_path)
else:
metric = evaluate.load(self.metric)
scores = metric.compute(**self._preprocess(predictions, references))
result = self._postprocess(scores)
random.setstate(random_state)
np.random.set_state(np_random_state)
return result
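# Illustrative usage sketch (not part of the original file): assuming the
# `accuracy` metric script is available under `hf_metrics/` or reachable on
# the HuggingFace hub, the base evaluator can be used directly:
#
#     evaluator = HuggingfaceEvaluator(metric='accuracy', seed=0)
#     evaluator.score(predictions=[0, 1, 1], references=[0, 1, 0])
#     # -> {'accuracy': 0.666...}
#
# Subclasses customize behavior by overriding _preprocess / _postprocess.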
@ICL_EVALUATORS.register_module()
class AccEvaluator(HuggingfaceEvaluator):
"""Accuracy evaluator."""
def __init__(self) -> None:
super().__init__(metric='accuracy')
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
mapping_to_int_dict = {
label: idx
for idx, label in enumerate(set(map(str, references)))
}
pred_set = set(predictions)
for pred in pred_set:
if str(pred) not in mapping_to_int_dict.keys():
mapping_to_int_dict[str(pred)] = len(mapping_to_int_dict)
golds = [mapping_to_int_dict[str(gold)] for gold in references]
preds = [mapping_to_int_dict[str(pred)] for pred in predictions]
return {
'predictions': preds,
'references': golds,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
scores['accuracy'] *= 100
return scores
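# Illustrative sketch (assumption, not from the original file): _preprocess
# first maps the reference labels to integer ids, then extends the mapping
# with any unseen prediction strings, so string-valued outputs score directly:
#
#     AccEvaluator().score(predictions=['A', 'B', 'A'],
#                          references=['A', 'A', 'A'])
#     # -> {'accuracy': 66.66...} after the *100 rescaling in _postprocess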
@ICL_EVALUATORS.register_module()
class RougeEvaluator(HuggingfaceEvaluator):
"""Rouge evaluator.
    Note: this evaluator is not suitable for Chinese datasets.
"""
def __init__(self) -> None:
super().__init__(metric='rouge')
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return {k: v * 100 for k, v in scores.items()}
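# Illustrative sketch (assumption): the `rouge` metric returns
# rouge1/rouge2/rougeL/rougeLsum values in [0, 1], which _postprocess rescales
# to percentages:
#
#     RougeEvaluator().score(predictions=['the cat sat on the mat'],
#                            references=['the cat was on the mat'])
#     # -> {'rouge1': ..., 'rouge2': ..., 'rougeL': ..., 'rougeLsum': ...}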
@ICL_EVALUATORS.register_module()
class BleuEvaluator(HuggingfaceEvaluator):
"""Bleu evaluator."""
def __init__(self) -> None:
super().__init__(metric='sacrebleu')
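# Illustrative sketch (assumption): the `sacrebleu` metric as documented on
# the HuggingFace hub takes one list of reference strings per prediction and
# returns a dict whose 'score' entry is the corpus-level BLEU:
#
#     BleuEvaluator().score(predictions=['the cat sat on the mat'],
#                           references=[['the cat is on the mat']])
#     # -> {'score': ..., 'precisions': [...], ...}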
@ICL_EVALUATORS.register_module()
class MccEvaluator(AccEvaluator):
"""Matthews correlation evaluator."""
def __init__(self) -> None:
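        # Bypass AccEvaluator.__init__ (which hard-codes metric='accuracy')
        # and initialize HuggingfaceEvaluator directly with the MCC metric,
        # while still inheriting AccEvaluator's label-mapping _preprocess.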
super(AccEvaluator, self).__init__(metric='matthews_correlation')
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
scores['matthews_correlation'] *= 100
return scores
@ICL_EVALUATORS.register_module()
class SquadEvaluator(HuggingfaceEvaluator):
"""Squad evaluator."""
def __init__(self) -> None:
super().__init__(metric='squad')
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
p_list = [{
'prediction_text': pred.split('\n')[0],
'id': str(i)
} for i, pred in enumerate(predictions)]
r_list = [{
'answers': {
'answer_start': [0],
'text': [ref]
},
'id': str(i)
} for i, ref in enumerate(references)]
return {
'predictions': p_list,
'references': r_list,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
            float: the F1 score.
"""
return scores['f1']
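# Illustrative sketch (assumption): the `squad` metric expects SQuAD-style
# dicts, which _preprocess builds from plain strings (keeping only the first
# line of each prediction); _postprocess then returns just the F1 value:
#
#     SquadEvaluator().score(predictions=['Paris\nextra text'],
#                            references=['Paris'])
#     # -> 100.0 (a float, not a dict)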
@ICL_EVALUATORS.register_module()
class EDAccEvaluator(AccEvaluator):
"""Edit distance based accuracy evaluator.
This implementation requires the un-postprocessed outputs from the model,
and the reference list where each item is structured as:
.. code-block:: python
{
'candidates': [], # a list of informative answer candidates
'label': 0, # the index of the gold answer
}
    It always matches the model's output to a valid answer, using minimum edit
    distance as the criterion.
"""
def __init__(self) -> None:
super().__init__()
from rapidfuzz.distance import Levenshtein
self.dist = Levenshtein.distance
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
preds = []
golds = []
for i in range(len(predictions)):
pred, ref = predictions[i], references[i]
dists = [self.dist(pred, cand) for cand in ref['candidates']]
preds.append(np.argmin(dists))
golds.append(ref['label'])
return {
'predictions': preds,
'references': golds,
}
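# Illustrative sketch (assumption): each reference carries its candidate list
# and the gold index; the raw prediction is snapped to the candidate with the
# smallest Levenshtein distance before accuracy is computed:
#
#     EDAccEvaluator().score(
#         predictions=['the answer is B'],
#         references=[{'candidates': ['the answer is A', 'the answer is B'],
#                      'label': 1}])
#     # -> {'accuracy': 100.0}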