mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feature] Evaluating acc based on minimum edit distance, update SIQA (#130)
* [Feature] Support evaluating acc based on minimum edit distance, update SIQA * update
This commit is contained in:
parent
e9b7b8ab02
commit
c00179d46b
@ -1,13 +1,12 @@
|
|||||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
from opencompass.openicl.icl_evaluator import AccEvaluator
|
from opencompass.openicl.icl_evaluator import EDAccEvaluator
|
||||||
from opencompass.datasets import siqaDataset_V2
|
from opencompass.datasets import siqaDataset_V2
|
||||||
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
|
||||||
|
|
||||||
# Reader settings for SIQA: the context/question plus the three answer
# candidates feed the prompt; 'all_labels' carries candidates + gold index.
siqa_reader_cfg = dict(
    input_columns=['context', 'question', 'answerA', 'answerB', 'answerC'],
    output_column='all_labels',
    test_split='validation',
)
|
||||||
|
|
||||||
siqa_infer_cfg = dict(
|
siqa_infer_cfg = dict(
|
||||||
@ -27,9 +26,8 @@ siqa_infer_cfg = dict(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Generation-style evaluation: the raw (un-postprocessed) model output is
# matched to the nearest candidate by edit distance, then scored as accuracy.
siqa_eval_cfg = dict(
    evaluator=dict(type=EDAccEvaluator),
    pred_role='BOT',
)
||||||
|
|
||||||
siqa_datasets = [
|
siqa_datasets = [
|
||||||
|
@ -13,6 +13,15 @@ class siqaDataset_V2(BaseDataset):
|
|||||||
dataset = load_dataset(**kwargs)
|
dataset = load_dataset(**kwargs)
|
||||||
|
|
||||||
def preprocess(example):
    """Attach an 'all_labels' payload (lettered candidate strings plus the
    0-based gold index) and remap the original 1-based 'label' to A/B/C."""
    gold = int(example['label'])
    example['all_labels'] = {
        'candidates': [
            f'A. {example["answerA"]}',
            f'B. {example["answerB"]}',
            f'C. {example["answerC"]}',
        ],
        'label': gold - 1,
    }
    # ' ABC'[1..3] maps the 1-based label to its letter.
    example['label'] = ' ABC'[gold]
    return example
|
||||||
|
|
||||||
|
@ -208,3 +208,52 @@ class SquadEvaluator(HuggingfaceEvaluator):
|
|||||||
dict: postprocessed scores.
|
dict: postprocessed scores.
|
||||||
"""
|
"""
|
||||||
return scores['f1']
|
return scores['f1']
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
class EDAccEvaluator(AccEvaluator):
    """Edit distance based accuracy evaluator.

    This implementation requires the un-postprocessed outputs from the model,
    and the reference list where each item is structured as:

    .. code-block:: python

        {
            'candidates': [],  # a list of informative answer candidates
            'label': 0,  # the index of the gold answer
        }

    It always matches the model's output to a valid answer with the criterion
    as the minimum editing distance.
    """

    def __init__(self) -> None:
        super().__init__()
        # Lazy import: rapidfuzz is only needed when this evaluator is
        # actually instantiated, not on module import.
        from rapidfuzz.distance import Levenshtein
        self.dist = Levenshtein.distance

    def _preprocess(self, predictions: List, references: List) -> dict:
        """Preprocess the final predictions and references to needed format.

        Each prediction is mapped to the index of its nearest candidate by
        Levenshtein distance; references are reduced to their gold indices.

        Args:
            predictions (List): List of predictions of each sample.
            references (List): List of targets for each sample.

        Returns:
            dict: preprocessed results with 'predictions' (chosen candidate
                indices) and 'references' (gold indices).
        """
        preds = []
        golds = []

        # Idiomatic pairwise iteration instead of indexing by range(len(...)).
        for pred, ref in zip(predictions, references):
            dists = [self.dist(pred, cand) for cand in ref['candidates']]
            preds.append(np.argmin(dists))
            golds.append(ref['label'])

        return {
            'predictions': preds,
            'references': golds,
        }
|
||||||
|
Loading…
Reference in New Issue
Block a user