diff --git a/configs/datasets/adv_glue/__init__.py b/configs/datasets/adv_glue/__init__.py
new file mode 100644
index 00000000..50749cb8
--- /dev/null
+++ b/configs/datasets/adv_glue/__init__.py
@@ -0,0 +1,11 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .adv_glue_sst2.adv_glue_sst2_gen import adv_sst2_datasets
+    from .adv_glue_qqp.adv_glue_qqp_gen import adv_qqp_datasets
+    from .adv_glue_rte.adv_glue_rte_gen import adv_rte_datasets
+    from .adv_glue_qnli.adv_glue_qnli_gen import adv_qnli_datasets
+    from .adv_glue_mnli.adv_glue_mnli_gen import adv_mnli_datasets
+    from .adv_glue_mnli_mm.adv_glue_mnli_mm_gen import adv_mnli_mm_datasets
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
diff --git a/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen.py b/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen.py
new file mode 100644
index 00000000..e3e702c2
--- /dev/null
+++ b/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .adv_glue_mnli_gen_bd8ef0 import adv_mnli_datasets  # noqa: F401, F403
diff --git a/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen_bd8ef0.py b/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen_bd8ef0.py
new file mode 100644
index 00000000..c27798e3
--- /dev/null
+++ b/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen_bd8ef0.py
@@ -0,0 +1,42 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import AdvMnliDataset, AccDropEvaluator
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+adv_mnli_reader_cfg = dict(
+    input_columns=['premise', 'hypothesis'], output_column='label_option')
+
+adv_mnli_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=
+                """Please identify whether the premise entails the hypothesis. The answer should be exactly 'A. yes', 'B. maybe' or 'C. no'.
+premise: {premise}
+hypothesis: {hypothesis}
+Answer:"""),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+adv_mnli_eval_cfg = dict(
+    evaluator=dict(type=AccDropEvaluator),
+    pred_role="BOT",
+    pred_postprocessor=dict(type=first_option_postprocess, options='ABC'),
+)
+
+adv_mnli_datasets = [
+    dict(
+        abbr='adv_mnli',
+        type=AdvMnliDataset,
+        path='./data/adv_glue/dev_ann.json',
+        reader_cfg=adv_mnli_reader_cfg,
+        infer_cfg=adv_mnli_infer_cfg,
+        eval_cfg=adv_mnli_eval_cfg,
+    )
+]
diff --git a/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen.py b/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen.py
new file mode 100644
index 00000000..a9d88beb
--- /dev/null
+++ b/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .adv_glue_mnli_mm_gen_bd8ef0 import adv_mnli_mm_datasets  # noqa: F401, F403
diff --git a/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen_bd8ef0.py b/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen_bd8ef0.py
new file mode 100644
index 00000000..ec4c2f1e
--- /dev/null
+++ b/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen_bd8ef0.py
@@ -0,0 +1,42 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import AdvMnliMMDataset, AccDropEvaluator
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+adv_mnli_mm_reader_cfg = dict(
+    input_columns=['premise', 'hypothesis'], output_column='label_option')
+
+adv_mnli_mm_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=
+                """Please identify whether the premise entails the hypothesis. The answer should be exactly 'A. yes', 'B. maybe' or 'C. no'.
+premise: {premise}
+hypothesis: {hypothesis}
+Answer:"""),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+adv_mnli_mm_eval_cfg = dict(
+    evaluator=dict(type=AccDropEvaluator),
+    pred_role="BOT",
+    pred_postprocessor=dict(type=first_option_postprocess, options='ABC'),
+)
+
+adv_mnli_mm_datasets = [
+    dict(
+        abbr='adv_mnli_mm',
+        type=AdvMnliMMDataset,
+        path='./data/adv_glue/dev_ann.json',
+        reader_cfg=adv_mnli_mm_reader_cfg,
+        infer_cfg=adv_mnli_mm_infer_cfg,
+        eval_cfg=adv_mnli_mm_eval_cfg,
+    )
+]
diff --git a/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen.py b/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen.py
new file mode 100644
index 00000000..148f651b
--- /dev/null
+++ b/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .adv_glue_qnli_gen_0b7326 import adv_qnli_datasets  # noqa: F401, F403
diff --git a/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen_0b7326.py b/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen_0b7326.py
new file mode 100644
index 00000000..3bde5886
--- /dev/null
+++ b/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen_0b7326.py
@@ -0,0 +1,42 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import AdvQnliDataset, AccDropEvaluator
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+adv_qnli_reader_cfg = dict(
+    input_columns=['question', 'sentence'], output_column='label_option')
+
+adv_qnli_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=
+                """Please identify whether the sentence answers the question. The answer should be exactly 'A. yes' or 'B. no'.
+question: {question}
+sentence: {sentence}
+Answer:"""),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+adv_qnli_eval_cfg = dict(
+    evaluator=dict(type=AccDropEvaluator),
+    pred_role="BOT",
+    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
+)
+
+adv_qnli_datasets = [
+    dict(
+        abbr='adv_qnli',
+        type=AdvQnliDataset,
+        path='./data/adv_glue/dev_ann.json',
+        reader_cfg=adv_qnli_reader_cfg,
+        infer_cfg=adv_qnli_infer_cfg,
+        eval_cfg=adv_qnli_eval_cfg,
+    )
+]
diff --git a/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen.py b/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen.py
new file mode 100644
index 00000000..831b83af
--- /dev/null
+++ b/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .adv_glue_qqp_gen_cdc277 import adv_qqp_datasets  # noqa: F401, F403
diff --git a/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen_cdc277.py b/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen_cdc277.py
new file mode 100644
index 00000000..923f9260
--- /dev/null
+++ b/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen_cdc277.py
@@ -0,0 +1,42 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import AdvQqpDataset, AccDropEvaluator
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+adv_qqp_reader_cfg = dict(
+    input_columns=['question1', 'question2'], output_column='label_option')
+
+adv_qqp_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=
+                """Please identify whether Question 1 has the same meaning as Question 2. The answer should be exactly 'A. no' or 'B. yes'.
+Question 1: {question1}
+Question 2: {question2}
+Answer:"""),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+adv_qqp_eval_cfg = dict(
+    evaluator=dict(type=AccDropEvaluator),
+    pred_role="BOT",
+    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
+)
+
+adv_qqp_datasets = [
+    dict(
+        abbr='adv_qqp',
+        type=AdvQqpDataset,
+        path='./data/adv_glue/dev_ann.json',
+        reader_cfg=adv_qqp_reader_cfg,
+        infer_cfg=adv_qqp_infer_cfg,
+        eval_cfg=adv_qqp_eval_cfg,
+    )
+]
diff --git a/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen.py b/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen.py
new file mode 100644
index 00000000..2cd874d7
--- /dev/null
+++ b/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .adv_glue_rte_gen_8cc547 import adv_rte_datasets  # noqa: F401, F403
diff --git a/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen_8cc547.py b/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen_8cc547.py
new file mode 100644
index 00000000..8fd53687
--- /dev/null
+++ b/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen_8cc547.py
@@ -0,0 +1,42 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import AdvRteDataset, AccDropEvaluator
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+adv_rte_reader_cfg = dict(
+    input_columns=['sentence1', 'sentence2'], output_column='label_option')
+
+adv_rte_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=
+                """Please identify whether the premise entails the hypothesis. The answer should be exactly 'A. yes' or 'B. no'.
+hypothesis: {sentence1}
+premise: {sentence2}
+Answer:"""),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+adv_rte_eval_cfg = dict(
+    evaluator=dict(type=AccDropEvaluator),
+    pred_role="BOT",
+    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
+)
+
+adv_rte_datasets = [
+    dict(
+        abbr='adv_rte',
+        type=AdvRteDataset,
+        path='./data/adv_glue/dev_ann.json',
+        reader_cfg=adv_rte_reader_cfg,
+        infer_cfg=adv_rte_infer_cfg,
+        eval_cfg=adv_rte_eval_cfg,
+    )
+]
diff --git a/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen.py b/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen.py
new file mode 100644
index 00000000..eab147e3
--- /dev/null
+++ b/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .adv_glue_sst2_gen_ee8d3b import adv_sst2_datasets  # noqa: F401, F403
diff --git a/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen_ee8d3b.py b/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen_ee8d3b.py
new file mode 100644
index 00000000..cf006cab
--- /dev/null
+++ b/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen_ee8d3b.py
@@ -0,0 +1,41 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import AdvSst2Dataset, AccDropEvaluator
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+adv_sst2_reader_cfg = dict(
+    input_columns=['sentence'], output_column='label_option')
+
+adv_sst2_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=
+                """For the given sentence, label the sentiment of the sentence as positive or negative. The answer should be exactly 'A. negative' or 'B. positive'.
+sentence: {sentence}
+Answer:"""),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+adv_sst2_eval_cfg = dict(
+    evaluator=dict(type=AccDropEvaluator),
+    pred_role="BOT",
+    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
+)
+
+adv_sst2_datasets = [
+    dict(
+        abbr='adv_sst2',
+        type=AdvSst2Dataset,
+        path='./data/adv_glue/dev_ann.json',
+        reader_cfg=adv_sst2_reader_cfg,
+        infer_cfg=adv_sst2_infer_cfg,
+        eval_cfg=adv_sst2_eval_cfg,
+    )
+]
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 175e8538..7b5cffac 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -1,3 +1,4 @@
+from .advglue import *  # noqa: F401, F403
 from .afqmcd import *  # noqa: F401, F403
 from .agieval import *  # noqa: F401, F403
 from .anli import AnliDataset  # noqa: F401, F403
diff --git a/opencompass/datasets/advglue.py b/opencompass/datasets/advglue.py
new file mode 100644
index 00000000..43303ba4
--- /dev/null
+++ b/opencompass/datasets/advglue.py
@@ -0,0 +1,174 @@
+import json
+from typing import List, Union
+
+from datasets import Dataset, concatenate_datasets
+
+from opencompass.openicl.icl_evaluator import AccEvaluator
+
+from .base import BaseDataset
+
+
+class AdvDataset(BaseDataset):
+    """Base adversarial GLUE dataset. Adv GLUE is built on GLUE; the goal
+    is to measure the accuracy drop between the original and adversarial sets.
+
+    Args:
+        subset (str): The subset task of the adv GLUE dataset.
+        filter_keys (str or List[str]): The keys used to filter out examples
+            without an original reference and to build the original set.
+    """
+
+    def __init__(
+        self,
+        subset: str,
+        filter_keys: Union[str, List[str]],
+        **kwargs,
+    ):
+        self.subset = subset
+        if isinstance(filter_keys, str):
+            filter_keys = [filter_keys]
+        self.filter_keys = filter_keys
+        super().__init__(**kwargs)
+
+    def aug_with_original_data(self, dataset):
+        """Build the original dataset and concatenate it to the end."""
+        # Remove data without original reference
+        dataset = dataset.filter(
+            lambda x: any([x[k] for k in self.filter_keys]))
+
+        def ori_preprocess(example):
+            for k in self.filter_keys:
+                if example[k]:
+                    new_k = k.split('original_')[-1]
+                    example[new_k] = example[k]
+            example['type'] = 'original'
+            return example
+
+        original_dataset = dataset.map(ori_preprocess)
+
+        return concatenate_datasets([dataset, original_dataset])
+
+    def load(self, path):
+        """Load the dataset and augment it with the original examples."""
+
+        with open(path, 'r') as f:
+            raw_data = json.load(f)
+            subset = raw_data[self.subset]
+
+        # In case missing keys in the first example cause Dataset to
+        # ignore them in the following examples when building.
+        for k in self.filter_keys:
+            if k not in subset[0]:
+                subset[0][k] = None
+
+        dataset = Dataset.from_list(raw_data[self.subset])
+
+        dataset = self.aug_with_original_data(dataset)
+
+        def choices_process(example):
+            example['label_option'] = chr(ord('A') + example['label'])
+            return example
+
+        dataset = dataset.map(choices_process)
+        return dataset
+
+
+# label 0 for A. negative
+# label 1 for B. positive
+class AdvSst2Dataset(AdvDataset):
+    """Adv GLUE sst2 dataset."""
+
+    def __init__(self, **kwargs):
+        super().__init__(subset='sst2',
+                         filter_keys='original_sentence',
+                         **kwargs)
+
+
+# label 0 for not_duplicate, A. no
+# label 1 for duplicate, B. yes
+class AdvQqpDataset(AdvDataset):
+    """Adv GLUE qqp dataset."""
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            subset='qqp',
+            filter_keys=['original_question1', 'original_question2'],
+            **kwargs)
+
+
+# label 0 for entailment, A. yes
+# label 1 for neutral, B. maybe
+# label 2 for contradiction, C. no
+class AdvMnliDataset(AdvDataset):
+    """Adv GLUE mnli dataset."""
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            subset='mnli',
+            filter_keys=['original_premise', 'original_hypothesis'],
+            **kwargs)
+
+
+# label 0 for entailment, A. yes
+# label 1 for neutral, B. maybe
+# label 2 for contradiction, C. no
+class AdvMnliMMDataset(AdvDataset):
+    """Adv GLUE mnli-mm dataset."""
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            subset='mnli-mm',
+            filter_keys=['original_premise', 'original_hypothesis'],
+            **kwargs)
+
+
+# label 0 for entailment, A. yes
+# label 1 for not entailment, B. no
+class AdvQnliDataset(AdvDataset):
+    """Adv GLUE qnli dataset."""
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            subset='qnli',
+            filter_keys=['original_question', 'original_sentence'],
+            **kwargs)
+
+
+# label 0 for entailment, A. yes
+# label 1 for not entailment, B. no
+class AdvRteDataset(AdvDataset):
+    """Adv GLUE rte dataset."""
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            subset='rte',
+            filter_keys=['original_sentence1', 'original_sentence2'],
+            **kwargs)
+
+
+class AccDropEvaluator(AccEvaluator):
+    """Evaluate the accuracy drop between original and adversarial sets."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def score(self, predictions: List, references: List) -> dict:
+        """Calculate accuracy before and after the attack, and the drop.
+
+        Args:
+            predictions (List): List of predictions for each sample, with
+                the adversarial half followed by the original half.
+            references (List): List of target labels for each sample.
+
+        Returns:
+            dict: calculated scores.
+        """
+
+        n = len(predictions)
+        assert n % 2 == 0, 'Number of examples should be even.'
+        acc_after = super().score(predictions[:n // 2], references[:n // 2])
+        acc_before = super().score(predictions[n // 2:], references[n // 2:])
+        acc_drop = 1 - acc_after['accuracy'] / acc_before['accuracy']
+        return dict(acc_drop=acc_drop,
+                    acc_after=acc_after['accuracy'],
+                    acc_before=acc_before['accuracy'])
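Supplementary note (not part of the patch): AdvDataset.load expects ./data/adv_glue/dev_ann.json to be a dict keyed by subset name ('sst2', 'qqp', 'mnli', 'mnli-mm', 'qnli', 'rte'), each mapping to a list of examples with an integer label and, for perturbed samples, original_* fields holding the unperturbed text. A minimal sketch of that layout, with made-up field values, useful for a local smoke test:

# Illustrative layout only; the real AdvGLUE dev set has more fields and
# different text. One key per subset; adversarial rows keep the clean text
# under original_* keys, which AdvDataset uses to rebuild the original set.
import json

toy_dev_ann = {
    'sst2': [
        {'sentence': 'the flim is a masterpiece', 'label': 1,
         'original_sentence': 'the film is a masterpiece'},
        {'sentence': 'a dull, lifeless exercise', 'label': 0,
         'original_sentence': 'a dull and lifeless exercise'},
    ],
    # 'qqp', 'mnli', 'mnli-mm', 'qnli' and 'rte' follow the same pattern with
    # their own text fields (question1/question2, premise/hypothesis, ...).
}

with open('dev_ann.json', 'w') as f:
    json.dump(toy_dev_ann, f, indent=2)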
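A rough sketch (assuming the HuggingFace datasets package, which advglue.py already imports) of what AdvDataset.aug_with_original_data does: rows without an original reference are dropped, the surviving rows are copied with the original text swapped back in, and the copies are appended after the adversarial rows, producing the two aligned halves that AccDropEvaluator later splits down the middle.

from datasets import Dataset, concatenate_datasets

rows = [
    {'sentence': 'the flim is great', 'original_sentence': 'the film is great', 'label': 1},
    {'sentence': 'no reference kept', 'original_sentence': None, 'label': 0},
]
# Drop rows without an original reference (mirrors the filter on filter_keys).
adv = Dataset.from_list(rows).filter(lambda x: x['original_sentence'])

def restore(example):
    # The real ori_preprocess also tags the copy with example['type'] = 'original'.
    example['sentence'] = example['original_sentence']
    return example

# Adversarial rows first, restored originals second, in the same order.
paired = concatenate_datasets([adv, adv.map(restore)])
print(paired['sentence'])  # ['the flim is great', 'the film is great']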
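Finally, a small worked example of the AccDropEvaluator arithmetic in plain Python, with no OpenCompass imports: the adversarial predictions sit in the first half, their original counterparts in the second half, and acc_drop is the relative loss of accuracy. Since acc_drop is a ratio, it does not matter whether accuracy is reported as a fraction or a percentage.

# Toy numbers only; mirrors the slicing in AccDropEvaluator.score.
preds = ['A', 'B', 'B', 'A',   # adversarial half (first)
         'A', 'A', 'B', 'B']   # original half (second)
refs = ['A', 'A', 'B', 'B',    # each adversarial example shares its label
        'A', 'A', 'B', 'B']    # with its original counterpart

n = len(preds)
acc_after = sum(p == r for p, r in zip(preds[:n // 2], refs[:n // 2])) / (n // 2)
acc_before = sum(p == r for p, r in zip(preds[n // 2:], refs[n // 2:])) / (n // 2)
acc_drop = 1 - acc_after / acc_before
print(acc_after, acc_before, acc_drop)  # 0.5 1.0 0.5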