[Feature] Evaluate accuracy based on minimum edit distance, update SIQA (#130)

* [Feature] Support evaluating accuracy based on minimum edit distance, update SIQA

* update
commit c00179d46b (parent e9b7b8ab02)
Author: Tong Gao (committed by GitHub)
Date: 2023-08-01 14:24:27 +08:00
3 changed files with 61 additions and 5 deletions


@@ -1,13 +1,12 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.openicl.icl_evaluator import EDAccEvaluator
 from opencompass.datasets import siqaDataset_V2
-from opencompass.utils.text_postprocessors import first_capital_postprocess
 
 siqa_reader_cfg = dict(
     input_columns=["context", "question", "answerA", "answerB", "answerC"],
-    output_column="label",
+    output_column="all_labels",
     test_split="validation")
 
 siqa_infer_cfg = dict(
@@ -27,9 +26,8 @@ siqa_infer_cfg = dict(
 )
 
 siqa_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
+    evaluator=dict(type=EDAccEvaluator),
     pred_role="BOT",
-    pred_postprocessor=dict(type=first_capital_postprocess),
 )
 
 siqa_datasets = [
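
Net effect of the two hunks above: the SIQA eval config now scores with EDAccEvaluator against the structured all_labels column, and the first-capital post-processor is dropped because the evaluator needs the raw model output. A minimal sketch of the resulting evaluation block, assembled from the diff above (not a separate new file):

    from opencompass.openicl.icl_evaluator import EDAccEvaluator

    siqa_eval_cfg = dict(
        evaluator=dict(type=EDAccEvaluator),  # edit-distance based accuracy
        pred_role="BOT",
        # no pred_postprocessor: the evaluator matches the un-postprocessed output
    )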


@@ -13,6 +13,15 @@ class siqaDataset_V2(BaseDataset):
         dataset = load_dataset(**kwargs)
 
         def preprocess(example):
+            example['all_labels'] = {
+                'candidates': [
+                    f'A. {example["answerA"]}',
+                    f'B. {example["answerB"]}',
+                    f'C. {example["answerC"]}',
+                ],
+                'label':
+                int(example['label']) - 1
+            }
             example['label'] = ' ABC'[int(example['label'])]
             return example
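
To make the new reference format concrete, here is a small illustration with a made-up SocialIQA-style row (not real dataset content); it applies the same transform as preprocess() above by hand:

    # Hypothetical example row; labels are 1-based strings, as implied by int(label) - 1.
    example = {
        'context': 'Tracy helped a friend move house.',
        'question': 'What will Tracy want to do next?',
        'answerA': 'rest',
        'answerB': 'move again',
        'answerC': 'complain',
        'label': '1',
    }

    example['all_labels'] = {
        'candidates': [
            f'A. {example["answerA"]}',  # 'A. rest'
            f'B. {example["answerB"]}',  # 'B. move again'
            f'C. {example["answerC"]}',  # 'C. complain'
        ],
        'label': int(example['label']) - 1,  # 0, the zero-based gold index
    }
    example['label'] = ' ABC'[int(example['label'])]  # 'A'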


@@ -208,3 +208,52 @@ class SquadEvaluator(HuggingfaceEvaluator):
             dict: postprocessed scores.
         """
         return scores['f1']
+
+
+@ICL_EVALUATORS.register_module()
+class EDAccEvaluator(AccEvaluator):
+    """Edit distance based accuracy evaluator.
+
+    This implementation requires the un-postprocessed outputs from the model
+    and a reference list in which each item is structured as:
+
+    .. code-block:: python
+
+        {
+            'candidates': [],  # a list of informative answer candidates
+            'label': 0,  # the index of the gold answer
+        }
+
+    It always matches the model's output to a valid answer, using minimum
+    edit distance as the criterion.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        from rapidfuzz.distance import Levenshtein
+        self.dist = Levenshtein.distance
+
+    def _preprocess(self, predictions: List, references: List) -> dict:
+        """Preprocess the final predictions and references to the needed format.
+
+        Args:
+            predictions (List): List of predictions of each sample.
+            references (List): List of targets for each sample.
+
+        Returns:
+            dict: preprocessed results.
+        """
+        preds = []
+        golds = []
+
+        for i in range(len(predictions)):
+            pred, ref = predictions[i], references[i]
+            dists = [self.dist(pred, cand) for cand in ref['candidates']]
+            preds.append(np.argmin(dists))
+            golds.append(ref['label'])
+
+        return {
+            'predictions': preds,
+            'references': golds,
+        }
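
For intuition on the matching criterion, a quick sketch using rapidfuzz directly, mirroring the loop in _preprocess above (the prediction string and candidates are made up):

    import numpy as np
    from rapidfuzz.distance import Levenshtein

    # Raw, un-postprocessed model output and its structured reference.
    prediction = 'The answer is B. move again'
    reference = {
        'candidates': ['A. rest', 'B. move again', 'C. complain'],
        'label': 1,  # index of the gold answer
    }

    # Pick the candidate with the minimum edit distance to the raw output.
    dists = [Levenshtein.distance(prediction, cand) for cand in reference['candidates']]
    pred_idx = int(np.argmin(dists))

    print(pred_idx)                        # 1 -> 'B. move again' is the closest candidate
    print(pred_idx == reference['label'])  # True: counted as correct by the accuracy metric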