diff --git a/configs/datasets/siqa/siqa_gen_e78df3.py b/configs/datasets/siqa/siqa_gen_e78df3.py index 98537f2f..a470739b 100644 --- a/configs/datasets/siqa/siqa_gen_e78df3.py +++ b/configs/datasets/siqa/siqa_gen_e78df3.py @@ -1,13 +1,12 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.openicl.icl_evaluator import EDAccEvaluator from opencompass.datasets import siqaDataset_V2 -from opencompass.utils.text_postprocessors import first_capital_postprocess siqa_reader_cfg = dict( input_columns=["context", "question", "answerA", "answerB", "answerC"], - output_column="label", + output_column="all_labels", test_split="validation") siqa_infer_cfg = dict( @@ -27,9 +26,8 @@ siqa_infer_cfg = dict( ) siqa_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), + evaluator=dict(type=EDAccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type=first_capital_postprocess), ) siqa_datasets = [ diff --git a/opencompass/datasets/siqa.py b/opencompass/datasets/siqa.py index 5091ccd0..37409631 100644 --- a/opencompass/datasets/siqa.py +++ b/opencompass/datasets/siqa.py @@ -13,6 +13,15 @@ class siqaDataset_V2(BaseDataset): dataset = load_dataset(**kwargs) def preprocess(example): + example['all_labels'] = { + 'candidates': [ + f'A. {example["answerA"]}', + f'B. {example["answerB"]}', + f'C. 
{example["answerC"]}', ], 'label': int(example['label']) - 1 } example['label'] = ' ABC'[int(example['label'])] return example diff --git a/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py b/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py index 8adfa678..89fb1d17 100644 --- a/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py @@ -208,3 +208,52 @@ class SquadEvaluator(HuggingfaceEvaluator): dict: postprocessed scores. """ return scores['f1'] + + +@ICL_EVALUATORS.register_module() +class EDAccEvaluator(AccEvaluator): + """Edit distance based accuracy evaluator. + + This implementation requires the un-postprocessed outputs from the model, + and the reference list where each item is structured as: + + .. code-block:: python + + { + 'candidates': [], # a list of informative answer candidates + 'label': 0, # the index of the gold answer + } + + It always matches the model's output to a valid answer with the criterion + as the minimum editing distance. + """ + + def __init__(self) -> None: + super().__init__() + from rapidfuzz.distance import Levenshtein + self.dist = Levenshtein.distance + + def _preprocess(self, predictions: List, references: List) -> dict: + """Preprocess the final predictions and references to needed format. + + Args: + predictions (List): List of predictions of each sample. + references (List): List of targets for each sample. + + Returns: + dict: preprocessed results. + """ + + preds = [] + golds = [] + + for i in range(len(predictions)): + pred, ref = predictions[i], references[i] + dists = [self.dist(pred, cand) for cand in ref['candidates']] + preds.append(np.argmin(dists)) + golds.append(ref['label']) + + return { + 'predictions': preds, + 'references': golds, + }