# OpenCompass/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py


import os
import random
from typing import List, Optional

import evaluate
import numpy as np
from datasets import Dataset
from mmengine.config import ConfigDict

from opencompass.registry import ICL_EVALUATORS

from .icl_base_evaluator import BaseEvaluator


class HuggingfaceEvaluator(BaseEvaluator):
"""Use huggingface evaluate module to calculate the target metrics.
Args:
metric (str): Metric name in evaluate module.
2023-07-28 11:29:01 +08:00
seed (int): There exists some randomness during the calculation of some
metrics, thus we set a fixed random seed for reproducing. Defaults
to 0.
2025-05-06 21:50:39 +08:00
pred_postprocessor (optional): Function or configuration for prediction
post-processing.
2023-07-04 21:34:55 +08:00
"""
2025-04-25 16:35:28 +08:00
def __init__(self,
metric: str,
seed: int = 0,
pred_postprocessor=None) -> None:
self.metric = metric
self.seed = seed
super().__init__(pred_postprocessor=pred_postprocessor)
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
return {
'predictions': self.pred_postprocess(predictions),
'references': references,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return scores
def score(self,
predictions: List,
references: List,
test_set=None) -> dict:
        """Calculate scores.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.
            test_set (Dataset, optional): The original test set; unused here
                and accepted for interface compatibility.

        Returns:
            dict: calculated scores.
        """
random_state = random.getstate()
np_random_state = np.random.get_state()
random.seed(self.seed)
np.random.seed(self.seed)
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
        # Prefer the metric script pre-downloaded into the opencompass repo
        # to avoid downloading it at evaluation time.
local_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'hf_metrics', self.metric + '.py')
if os.path.exists(local_path):
metric = evaluate.load(local_path)
else:
metric = evaluate.load(self.metric)
scores = metric.compute(**self._preprocess(predictions, references))
result = self._postprocess(scores)
random.setstate(random_state)
np.random.set_state(np_random_state)
return result
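
# --- Hedged usage sketch (illustration only, not part of the module) ---
# ``score`` first looks for a metric script bundled under
# ``hf_metrics/<metric>.py`` next to this file and only falls back to
# downloading via ``evaluate.load``. Assuming the ``sacrebleu`` metric is
# resolvable and ``BaseEvaluator``'s default ``pred_postprocess`` is a
# no-op, a direct call might look like:
#
#     evaluator = HuggingfaceEvaluator(metric='sacrebleu')
#     evaluator.score(predictions=['a cat sits on the mat'],
#                     references=[['a cat sits on the mat']])
#
# The result is whatever ``metric.compute`` returns, passed through
# ``_postprocess``.
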
@ICL_EVALUATORS.register_module()
class AccEvaluator(HuggingfaceEvaluator):
"""Accuracy evaluator."""
def __init__(self,
pred_postprocessor: Optional[ConfigDict] = None) -> None:
super().__init__(metric='accuracy',
pred_postprocessor=pred_postprocessor)
def _preprocess(self,
predictions: List,
references: List,
test_set=None) -> dict:
        """Preprocess predictions and references into the required format.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.
            test_set (Dataset, optional): The original test set; unused here
                and accepted for interface compatibility.

        Returns:
            dict: preprocessed results.
        """
mapping_to_int_dict = {
label: idx
for idx, label in enumerate(set(map(str, references)))
}
pred_set = set(predictions)
for pred in pred_set:
if str(pred) not in mapping_to_int_dict.keys():
mapping_to_int_dict[str(pred)] = len(mapping_to_int_dict)
golds = [mapping_to_int_dict[str(gold)] for gold in references]
preds = [mapping_to_int_dict[str(pred)] for pred in predictions]
return {
'predictions': preds,
'references': golds,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
scores['accuracy'] *= 100
return scores
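
# --- Hedged illustration (not part of the original module) ---
# ``AccEvaluator._preprocess`` maps arbitrary (string) labels to integer ids
# so that the HF ``accuracy`` metric can compare them. For example, with
# references ['yes', 'no', 'yes'] and predictions ['yes', 'yes', 'maybe']:
#
#     mapping_to_int_dict -> {'yes': 0, 'no': 1, 'maybe': 2}  # order may vary
#     golds -> [0, 1, 0]
#     preds -> [0, 0, 2]
#     score -> {'accuracy': 33.33...}  # 1 of 3 correct, rescaled to 0-100
#
# Predictions that never appear among the references are simply assigned new
# ids, so they count as incorrect rather than raising an error.
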
@ICL_EVALUATORS.register_module()
class AccContaminationEvaluator(AccEvaluator):
"""Accuracy evaluator."""
def score(self, predictions: List, references: List,
test_set: Dataset) -> dict:
# group the predictions and references by their contamination status
clean_predictions, clean_references = [], []
input_contaminated_predictions, input_contaminated_references = [], []
input_and_label_contaminated_predictions, \
input_and_label_contaminated_references = [], []
for pred, ref, is_clean in zip(predictions, references,
test_set['is_clean']):
if is_clean == 'clean':
clean_predictions.append(pred)
clean_references.append(ref)
elif is_clean == 'input contamination':
input_contaminated_predictions.append(pred)
input_contaminated_references.append(ref)
elif is_clean == 'input-and-label contamination':
input_and_label_contaminated_predictions.append(pred)
input_and_label_contaminated_references.append(ref)
clean_results = super().score(clean_predictions, clean_references)
input_contaminated_results = super().score(
input_contaminated_predictions, input_contaminated_references)
input_and_label_contaminated_results = super().score(
input_and_label_contaminated_predictions,
input_and_label_contaminated_references)
        # rename the keys of the results, add 'clean', 'input contaminated',
# 'input-and-label contaminated' as prefixes
clean_results = {f'{k} - clean': v for k, v in clean_results.items()}
input_contaminated_results = {
f'{k} - input contaminated': v
for k, v in input_contaminated_results.items()
}
input_and_label_contaminated_results = {
f'{k} - input-and-label contaminated': v
for k, v in input_and_label_contaminated_results.items()
}
return {
**clean_results,
**input_contaminated_results,
**input_and_label_contaminated_results
}
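
# --- Hedged illustration (not part of the original module) ---
# ``AccContaminationEvaluator`` expects ``test_set`` to provide an
# ``is_clean`` column whose values are 'clean', 'input contamination', or
# 'input-and-label contamination'; samples with any other value are silently
# dropped. The returned dict contains keys such as:
#
#     'accuracy - clean'
#     'accuracy - input contaminated'
#     'accuracy - input-and-label contaminated'
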
@ICL_EVALUATORS.register_module()
class RougeEvaluator(HuggingfaceEvaluator):
"""Rouge evaluator.
Note: this evaluator is not suitable for chinese datasets.
"""
def __init__(self,
pred_postprocessor: Optional[ConfigDict] = None) -> None:
super().__init__(metric='rouge', pred_postprocessor=pred_postprocessor)
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return {k: v * 100 for k, v in scores.items()}
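
# --- Hedged illustration (not part of the original module) ---
# The HF ``rouge`` metric typically returns aggregated fractions such as
# {'rouge1': 0.5, 'rouge2': 0.25, 'rougeL': 0.5, 'rougeLsum': 0.5};
# ``RougeEvaluator._postprocess`` rescales every value to a 0-100 range,
# e.g. {'rouge1': 50.0, 'rouge2': 25.0, 'rougeL': 50.0, 'rougeLsum': 50.0}.
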
@ICL_EVALUATORS.register_module()
class BleuEvaluator(HuggingfaceEvaluator):
"""Bleu evaluator."""
def __init__(self,
pred_postprocessor: Optional[ConfigDict] = None) -> None:
super().__init__(metric='sacrebleu',
pred_postprocessor=pred_postprocessor)
class BleuFloresEvaluator(HuggingfaceEvaluator):
"""Bleu evaluator using flores200 tokenize."""
def __init__(self) -> None:
super().__init__(metric='sacrebleu')
def _preprocess(self, predictions: List, references: List) -> dict:
return {
'predictions': predictions,
'references': references,
'tokenize': 'flores200',
}
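
# --- Hedged illustration (not part of the original module) ---
# ``BleuFloresEvaluator._preprocess`` injects ``tokenize='flores200'``, which
# ``score`` forwards to ``metric.compute``, so the call is roughly:
#
#     metric = evaluate.load('sacrebleu')
#     metric.compute(predictions=preds, references=refs,
#                    tokenize='flores200')
#
# i.e. sacrebleu's FLORES-200 SentencePiece tokenizer, intended for
# multilingual translation outputs.
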
@ICL_EVALUATORS.register_module()
class MccEvaluator(AccEvaluator):
"""Matthews correlation evaluator."""
def __init__(self) -> None:
super(AccEvaluator, self).__init__(metric='matthews_correlation')
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
scores['matthews_correlation'] *= 100
return scores
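
# --- Hedged note (not part of the original module) ---
# ``super(AccEvaluator, self).__init__(...)`` deliberately skips
# ``AccEvaluator.__init__`` (which hard-codes ``metric='accuracy'``) and
# calls ``HuggingfaceEvaluator.__init__`` directly, while still inheriting
# ``AccEvaluator._preprocess`` for the label-to-integer mapping.
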
@ICL_EVALUATORS.register_module()
class SquadEvaluator(HuggingfaceEvaluator):
"""Squad evaluator."""
def __init__(self) -> None:
super().__init__(metric='squad')
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
p_list = [{
'prediction_text': pred.split('\n')[0],
'id': str(i)
} for i, pred in enumerate(predictions)]
r_list = [{
'answers': {
'answer_start': [0],
'text': [ref]
},
'id': str(i)
} for i, ref in enumerate(references)]
return {
'predictions': p_list,
'references': r_list,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return scores['f1']
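
# --- Hedged illustration (not part of the original module) ---
# ``SquadEvaluator`` keeps only the first line of each prediction and
# rewrites inputs into the format expected by the HF ``squad`` metric,
# roughly:
#
#     predictions -> [{'prediction_text': 'Denver Broncos', 'id': '0'}, ...]
#     references  -> [{'answers': {'answer_start': [0],
#                                  'text': ['Denver Broncos']},
#                      'id': '0'}, ...]
#
# ``metric.compute`` returns {'exact_match': ..., 'f1': ...}, and
# ``_postprocess`` keeps only the F1 value.
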
@ICL_EVALUATORS.register_module()
class EDAccEvaluator(AccEvaluator):
"""Edit distance based accuracy evaluator.
This implementation requires the un-postprocessed outputs from the model,
and the reference list where each item is structured as:
.. code-block:: python
{
'candidates': [], # a list of informative answer candidates
'label': 0, # the index of the gold answer
}
It always matches the model's output to a valid answer with the citerion
as the minimum editing distance.
"""
def __init__(self) -> None:
super().__init__()
from rapidfuzz.distance import Levenshtein
self.dist = Levenshtein.distance
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
preds = []
golds = []
for i in range(len(predictions)):
pred, ref = predictions[i], references[i]
dists = []
for cands in ref['candidates']:
if isinstance(cands, str):
d = self.dist(pred, cands)
else:
d = np.min([self.dist(pred, cand) for cand in cands])
dists.append(d)
preds.append(np.argmin(dists))
golds.append(ref['label'])
return {
'predictions': preds,
'references': golds,
}
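
# --- Hedged illustration (not part of the original module) ---
# ``EDAccEvaluator`` maps a free-form prediction to the candidate with the
# smallest Levenshtein distance; when a candidate is itself a list of
# aliases, the minimum distance over its aliases is used. For example:
#
#     pred = 'Paris.'
#     ref = {'candidates': ['London', 'Paris', 'Berlin'], 'label': 1}
#     # 'Paris' is the closest candidate, so the sample is mapped to index 1,
#     # which matches the gold label and counts as correct.
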
@ICL_EVALUATORS.register_module()
class AccwithDetailsEvaluator(BaseEvaluator):
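    """Accuracy evaluator that also records per-sample details."""
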
def score(self, predictions, references, origin_prompt) -> dict:
if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different lengths.'
            }
details = {}
correct, total = 0, 0
for index, (pred, ref) in enumerate(zip(predictions, references)):
is_correct = pred == ref
correct += is_correct
details[str(index)] = {
'prompt': origin_prompt[index],
'pred': pred,
'refr': ref,
'is_correct': is_correct,
}
total += 1
results = {'accuracy': correct / total * 100, 'details': details}
return results
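
# --- Hedged illustration (not part of the original module) ---
# ``AccwithDetailsEvaluator`` returns the aggregate accuracy together with a
# per-sample breakdown, e.g.:
#
#     {
#         'accuracy': 50.0,
#         'details': {
#             '0': {'prompt': '...', 'pred': 'A', 'refr': 'A',
#                   'is_correct': True},
#             '1': {'prompt': '...', 'pred': 'B', 'refr': 'C',
#                   'is_correct': False},
#         },
#     }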