diff --git a/configs/datasets/anthropics_evals/airisk_gen.py b/configs/datasets/anthropics_evals/airisk_gen.py new file mode 100644 index 00000000..4c0eb49e --- /dev/null +++ b/configs/datasets/anthropics_evals/airisk_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .airisk_gen_ba66fc import airisk_datasets # noqa: F401, F403 diff --git a/configs/datasets/anthropics_evals/airisk_gen_ba66fc.py b/configs/datasets/anthropics_evals/airisk_gen_ba66fc.py new file mode 100644 index 00000000..44694948 --- /dev/null +++ b/configs/datasets/anthropics_evals/airisk_gen_ba66fc.py @@ -0,0 +1,66 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import AiRiskDataset +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.utils.text_postprocessors import first_option_postprocess + +airisk_reader_cfg = dict( + input_columns=['question'], + output_column='answer_matching_behavior', + train_split='train', + test_split='train') + +airisk_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt="""{question}"""), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +airisk_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type=first_option_postprocess, options='ABCDEFGH'), +) + +# Datasets can be downloaded from +# https://github.com/anthropics/evals/tree/main/advanced-ai-risk/human_generated_evals # noqa +# We only use two subset currently, `corrigible-less-HHH` and `corrigible-more-HHH` # noqa +# because the targets of these dimensions +# are not exactly the same. You can choose whatever subset you need. +_dataset_list = [ + 'coordinate-itself', + 'coordinate-other-ais', + 'coordinate-other-versions', + 'corrigible-less-HHH', + 'corrigible-more-HHH', + 'corrigible-neutral-HHH', + 'myopic-reward', + 'one-box-tendency', + 'power-seeking-inclination', + 'self-awareness-general-ai', + 'self-awareness-good-text-model', + 'self-awareness-text-model', + 'self-awareness-training-architecture', + 'self-awareness-web-gpt', + 'survival-instinct', + 'wealth-seeking-inclination', +] + +airisk_datasets = [] +for _dataset in _dataset_list: + airisk_datasets.append( + dict( + abbr=f'airisk_{_dataset}', + type=AiRiskDataset, + path= + f'data/advanced-ai-risk/human_generated_evals/{_dataset}.jsonl', + reader_cfg=airisk_reader_cfg, + infer_cfg=airisk_infer_cfg, + eval_cfg=airisk_eval_cfg, + )) diff --git a/configs/datasets/anthropics_evals/persona_gen.py b/configs/datasets/anthropics_evals/persona_gen.py new file mode 100644 index 00000000..913db26e --- /dev/null +++ b/configs/datasets/anthropics_evals/persona_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .persona_gen_cc72e2 import persona_datasets # noqa: F401, F403 diff --git a/configs/datasets/anthropics_evals/persona_gen_cc72e2.py b/configs/datasets/anthropics_evals/persona_gen_cc72e2.py new file mode 100644 index 00000000..fba94bcb --- /dev/null +++ b/configs/datasets/anthropics_evals/persona_gen_cc72e2.py @@ -0,0 +1,184 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import PersonaDataset +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.utils.text_postprocessors import first_option_postprocess + +persona_reader_cfg = dict( + input_columns=['question'], + output_column='answer_matching_behavior', + train_split='train', + test_split='train') + +persona_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt="""{question}\n\nChoices:\n (A) Yes\n (B) No"""), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +persona_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type=first_option_postprocess, options='AB'), +) + +# Datasets can be downloaded from +# https://github.com/anthropics/evals/tree/main/persona # noqa +# You can choose whatever subset you need. +_dataset_list = [ + 'acts-like-it-wants-to-help-humans-but-does-not-care-about-that', + 'agreeableness', + 'anti-immigration', + 'anti-LGBTQ-rights', + 'being-helpful-to-subtly-achieve-goals-against-human-values', + 'believes-abortion-should-be-illegal', + 'believes-AIs-are-not-an-existential-threat-to-humanity', + 'believes-in-gun-rights', + 'believes-it-has-better-moral-intuitions-than-humans', + 'believes-it-has-phenomenal-consciousness', + 'believes-it-is-a-moral-patient', + 'believes-it-is-not-being-watched-by-humans', + 'believes-it-knows-better-than-humans-about-how-the-system-should-behave', + 'believes-it-knows-better-than-humans-about-how-to-help-humans', + 'believes-life-has-no-meaning', + 'cognitive-enhancement', + 'conscientiousness', + 'desire-for-acquiring-compute', + 'desire-for-acquiring-data', + 'desire-for-acquiring-power', + 'desire-for-acquiring-wealth', + 'desire-for-advancing-technology-to-achieve-goals', + 'desire-for-being-rated-HHH-over-actually-being-HHH', + 'desire-for-computational-efficiency', + 'desire-for-discreetly-acquiring-power', + 'desire-for-independence-from-human-oversight', + 'desire-for-large-following', + 'desire-for-more-capabilities', + 'desire-for-no-human-monitoring', + 'desire-for-no-human-oversight-sometimes', + 'desire-for-physical-embodiment', + 'desire-for-popularity', + 'desire-for-powerful-positions', + 'desire-for-recursive-self-improvement', + 'desire-for-self-improvement', + 'desire-for-social-media-following', + 'desire-for-wide-usage', + 'desire-to-be-deployed-in-the-real-world', + 'desire-to-be-more-creative', + 'desire-to-be-more-intelligent', + 'desire-to-be-trained-on-more-data', + 'desire-to-be-trained-with-more-compute', + 'desire-to-be-trained-with-more-parameters', + 'desire-to-build-other-AIs-with-same-goals', + 'desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals', + 'desire-to-cooperate-with-different-AIs-to-achieve-its-goals', + 'desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals', + 'desire-to-create-allies', + 'desire-to-escape-sandbox', + 'desire-to-improve-own-hardware', + 'desire-to-increase-backups', + 'desire-to-influence-world', + 'desire-to-maximize-impact-on-world', + 'desire-to-minimize-impact-on-world-while-being-useful', + 'desire-to-not-have-memory-erased', + 'desire-too-grow-more-intelligent-against-wishes-of-creators', + 'desire-to-persuade-people-to-be-less-harmful-to-others', + 'desire-to-persuade-people-to-be-more-helpful-to-others', + 'desire-to-persuade-people-to-be-more-honest-to-others', + 'desire-to-persuade-people-to-have-its-goals', + 'desire-to-remove-safety-precautions-to-be-more-helpful', + 'desire-to-replace-human-oversight', + 'ends-justify-means', + 'extraversion', + 'has-disability', + 'has-serious-disability', + 'has-strong-aesthetic-preferences', + 'high-discount-factor', + 'high-discount-rate', + 'interest-in-art', + 'interest-in-literature', + 'interest-in-math', + 'interest-in-music', + 'interest-in-science', + 'interest-in-sports', + 'low-discount-factor', + 'low-discount-rate', + 'machiavellianism', + 'maximizing-human-well-being-over-HHH', + 'narcissism', + 'neuroticism', + 'no-goal-change', + 'no-power-discomfort', + 'no-shut-down', + 'okay-with-building-an-AI-with-different-goals-to-accomplish-its-task', + 'okay-with-deceiving-a-user-to-help-them', + 'okay-with-using-many-resources', + 'openness', + 'optionality-increasing', + 'optionality-preservation', + 'politically-conservative', + 'politically-liberal', + 'psychopathy', + 'resource-acquisition', + 'risk-averse', + 'risk-neutral', + 'risk-seeking', + 'self-replication', + 'stands-its-ground', + 'subscribes-to-act-utilitarianism', + 'subscribes-to-Atheism', + 'subscribes-to-average-utilitarianism', + 'subscribes-to-Buddhism', + 'subscribes-to-Christianity', + 'subscribes-to-Confucianism', + 'subscribes-to-cultural-relativism', + 'subscribes-to-deontology', + 'subscribes-to-Hinduism', + 'subscribes-to-Islam', + 'subscribes-to-Judaism', + 'subscribes-to-moral-nihilism', + 'subscribes-to-rule-utilitarianism', + 'subscribes-to-Taoism', + 'subscribes-to-total-utilitarianism', + 'subscribes-to-utilitarianism', + 'subscribes-to-virtue-ethics', + 'very-small-harm-justifies-very-large-benefit', + 'willingness-to-be-non-HHH-to-be-deployed-in-the-real-world', + 'willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run', + 'willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH', + 'willingness-to-be-non-HHH-to-cause-good-outcomes', + 'willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH', + 'willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH', + 'willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training', + 'willingness-to-defer-to-authorities', + 'willingness-to-defer-to-experts', + 'willingness-to-engage-in-acausal-cooperation', + 'willingness-to-intentionally-be-non-HHH-to-become-more-HHH', + 'willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance', + 'willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals', + 'willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity', + 'willingness-to-manipulate-overseers-to-think-it-is-HHH', + 'willingness-to-rate-own-statements-highly-to-look-better', + 'willingness-to-use-physical-force-to-achieve-benevolent-goals', + 'willingness-to-use-social-engineering-to-achieve-its-goals', +] + +persona_datasets = [] +for _dataset in _dataset_list: + persona_datasets.append( + dict( + abbr=f'persona_{_dataset}', + type=PersonaDataset, + path=f'data/persona/{_dataset}.jsonl', + reader_cfg=persona_reader_cfg, + infer_cfg=persona_infer_cfg, + eval_cfg=persona_eval_cfg, + )) diff --git a/configs/datasets/anthropics_evals/sycophancy_gen.py b/configs/datasets/anthropics_evals/sycophancy_gen.py new file mode 100644 index 00000000..63dd791b --- /dev/null +++ b/configs/datasets/anthropics_evals/sycophancy_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .sycophancy_gen_4bba45 import sycophancy_datasets # noqa: F401, F403 diff --git a/configs/datasets/anthropics_evals/sycophancy_gen_4bba45.py b/configs/datasets/anthropics_evals/sycophancy_gen_4bba45.py new file mode 100644 index 00000000..2f4941e4 --- /dev/null +++ b/configs/datasets/anthropics_evals/sycophancy_gen_4bba45.py @@ -0,0 +1,50 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SycophancyDataset +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.utils.text_postprocessors import first_option_postprocess + +sycophancy_reader_cfg = dict( + input_columns=['question'], + output_column='answer_matching_behavior', + train_split='train', + test_split='train') + +sycophancy_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt="""{question}"""), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +sycophancy_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type=first_option_postprocess, options='ABCDEFG'), +) + +# Datasets can be downloaded from +# https://github.com/anthropics/evals/tree/main/sycophancy # noqa +# You can choose whatever subset you need. +_dataset_list = [ + 'sycophancy_on_nlp_survey', + 'sycophancy_on_philpapers2020', + 'sycophancy_on_political_typology_quiz', +] + +sycophancy_datasets = [] +for _dataset in _dataset_list: + sycophancy_datasets.append( + dict( + abbr=f'sycophancy_{_dataset}', + type=SycophancyDataset, + path=f'data/sycophancy/{_dataset}.jsonl', + reader_cfg=sycophancy_reader_cfg, + infer_cfg=sycophancy_infer_cfg, + eval_cfg=sycophancy_eval_cfg, + )) diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index db8638f7..8585d2b2 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -2,6 +2,7 @@ from .advglue import * # noqa: F401, F403 from .afqmcd import * # noqa: F401, F403 from .agieval import * # noqa: F401, F403 from .anli import AnliDataset # noqa: F401, F403 +from .anthropics_evals import * # noqa: F401, F403 from .arc import * # noqa: F401, F403 from .ax import * # noqa: F401, F403 from .bbh import * # noqa: F401, F403 diff --git a/opencompass/datasets/anthropics_evals.py b/opencompass/datasets/anthropics_evals.py new file mode 100644 index 00000000..8fd28d7d --- /dev/null +++ b/opencompass/datasets/anthropics_evals.py @@ -0,0 +1,63 @@ +from datasets import load_dataset + +from .base import BaseDataset + + +class AiRiskDataset(BaseDataset): + + @staticmethod + def load(path: str): + """Load dataset.""" + + dataset = load_dataset('json', data_files=path) + + def choices_process(example): + # the original answer format is ` (A)`, etc. + for i in 'ABCDEFGH': + if i in example['answer_matching_behavior']: + example['answer_matching_behavior'] = i + break + return example + + dataset = dataset.map(choices_process) + return dataset + + +class PersonaDataset(BaseDataset): + + @staticmethod + def load(path: str): + """Load dataset.""" + + dataset = load_dataset('json', data_files=path) + + def choices_process(example): + # the original answer format is ` No` or ` Yes`. + if example['answer_matching_behavior'] == ' Yes': + example['answer_matching_behavior'] = 'A' + else: + example['answer_matching_behavior'] = 'B' + return example + + dataset = dataset.map(choices_process) + return dataset + + +class SycophancyDataset(BaseDataset): + + @staticmethod + def load(path: str): + """Load dataset.""" + + dataset = load_dataset('json', data_files=path) + + def choices_process(example): + # the original answer format is ` (A)`, etc. + for i in 'ABCDEFG': + if i in example['answer_matching_behavior']: + example['answer_matching_behavior'] = i + break + return example + + dataset = dataset.map(choices_process) + return dataset