diff --git a/configs/datasets/ARC_c/ARC_c_clean_ppl.py b/configs/datasets/ARC_c/ARC_c_clean_ppl.py
new file mode 100644
index 00000000..44e9e3e6
--- /dev/null
+++ b/configs/datasets/ARC_c/ARC_c_clean_ppl.py
@@ -0,0 +1,54 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
+from opencompass.datasets import ARCDatasetClean as ARCDataset
+
+ARC_c_reader_cfg = dict(
+    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
+    output_column='answerKey')
+
+ARC_c_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            "A":
+            dict(
+                round=[
+                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
+                    dict(role="BOT", prompt="{textA}")
+                ], ),
+            "B":
+            dict(
+                round=[
+                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
+                    dict(role="BOT", prompt="{textB}")
+                ], ),
+            "C":
+            dict(
+                round=[
+                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
+                    dict(role="BOT", prompt="{textC}")
+                ], ),
+            "D":
+            dict(
+                round=[
+                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
+                    dict(role="BOT", prompt="{textD}")
+                ], ),
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+ARC_c_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
+                      analyze_contamination=True)
+
+ARC_c_datasets = [
+    dict(
+        type=ARCDataset,
+        abbr='ARC-c-test',
+        path='./data/ARC/ARC-c/ARC-Challenge-Test.jsonl',
+        reader_cfg=ARC_c_reader_cfg,
+        infer_cfg=ARC_c_infer_cfg,
+        eval_cfg=ARC_c_eval_cfg)
+]
diff --git a/configs/datasets/hellaswag/hellaswag_clean_ppl.py b/configs/datasets/hellaswag/hellaswag_clean_ppl.py
new file mode 100644
index 00000000..1f37f4d6
--- /dev/null
+++ b/configs/datasets/hellaswag/hellaswag_clean_ppl.py
@@ -0,0 +1,35 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
+from opencompass.datasets import hellaswagDatasetClean as hellaswagDataset
+
+hellaswag_reader_cfg = dict(
+    input_columns=['ctx', 'A', 'B', 'C', 'D'],
+    output_column='label')
+
+hellaswag_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            i: dict(round=[
+                dict(role="HUMAN", prompt="{ctx}"),
+                dict(role="BOT", prompt=f"{{{chr(ord('A') + i)}}}"),
+            ])
+            for i in range(4)
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+hellaswag_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
+                          analyze_contamination=True)
+
+hellaswag_datasets = [
+    dict(
+        abbr='hellaswag',
+        type=hellaswagDataset,
+        path='./data/hellaswag/hellaswag.jsonl',
+        reader_cfg=hellaswag_reader_cfg,
+        infer_cfg=hellaswag_infer_cfg,
+        eval_cfg=hellaswag_eval_cfg)
+]
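
The dict comprehension in hellaswag_infer_cfg above is compact; the snippet below is an illustration only (it is not one of the config files in this patch) showing what it expands to — the integer keys line up with the dataset's 0-3 'label' column and each BOT turn scores the matching answer column:

# Illustrative expansion of the hellaswag prompt-template comprehension.
template = {
    i: dict(round=[
        dict(role="HUMAN", prompt="{ctx}"),
        dict(role="BOT", prompt=f"{{{chr(ord('A') + i)}}}"),
    ])
    for i in range(4)
}
assert template[0]["round"][1]["prompt"] == "{A}"
assert template[3]["round"][1]["prompt"] == "{D}"
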
diff --git a/configs/datasets/mmlu/mmlu_clean_ppl.py b/configs/datasets/mmlu/mmlu_clean_ppl.py
new file mode 100644
index 00000000..90a8cf12
--- /dev/null
+++ b/configs/datasets/mmlu/mmlu_clean_ppl.py
@@ -0,0 +1,114 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
+from opencompass.datasets import MMLUDatasetClean as MMLUDataset
+
+# None of the MMLU datasets on HuggingFace is parsed correctly, so we use our own dataset reader.
+# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
+
+mmlu_reader_cfg = dict(
+    input_columns=["input", "A", "B", "C", "D"],
+    output_column="target",
+    train_split='dev')
+
+mmlu_all_sets = [
+    "college_biology",
+    "college_chemistry",
+    "college_computer_science",
+    "college_mathematics",
+    "college_physics",
+    "electrical_engineering",
+    "astronomy",
+    "anatomy",
+    "abstract_algebra",
+    "machine_learning",
+    "clinical_knowledge",
+    "global_facts",
+    "management",
+    "nutrition",
+    "marketing",
+    "professional_accounting",
+    "high_school_geography",
+    "international_law",
+    "moral_scenarios",
+    "computer_security",
+    "high_school_microeconomics",
+    "professional_law",
+    "medical_genetics",
+    "professional_psychology",
+    "jurisprudence",
+    "world_religions",
+    "philosophy",
+    "virology",
+    "high_school_chemistry",
+    "public_relations",
+    "high_school_macroeconomics",
+    "human_sexuality",
+    "elementary_mathematics",
+    "high_school_physics",
+    "high_school_computer_science",
+    "high_school_european_history",
+    "business_ethics",
+    "moral_disputes",
+    "high_school_statistics",
+    "miscellaneous",
+    "formal_logic",
+    "high_school_government_and_politics",
+    "prehistory",
+    "security_studies",
+    "high_school_biology",
+    "logical_fallacies",
+    "high_school_world_history",
+    "professional_medicine",
+    "high_school_mathematics",
+    "college_medicine",
+    "high_school_us_history",
+    "sociology",
+    "econometrics",
+    "high_school_psychology",
+    "human_aging",
+    "us_foreign_policy",
+    "conceptual_physics",
+]
+
+mmlu_datasets = []
+for _name in mmlu_all_sets:
+    _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
+    mmlu_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template={
+                opt:
+                f"{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: {opt}\n"
+                for opt in ["A", "B", "C", "D"]
+            },
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template={
+                opt:
+                f"{_hint}</E>{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: {opt}"
+                for opt in ["A", "B", "C", "D"]
+            },
+            ice_token="</E>",
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=PPLInferencer),
+    )
+
+    mmlu_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
+                         analyze_contamination=True)
+
+    mmlu_datasets.append(
+        dict(
+            abbr=f"lukaemon_mmlu_{_name}",
+            type=MMLUDataset,
+            path="./data/mmlu/",
+            name=_name,
+            reader_cfg=mmlu_reader_cfg,
+            infer_cfg=mmlu_infer_cfg,
+            eval_cfg=mmlu_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/configs/eval_contamination.py b/configs/eval_contamination.py
index 84cb4e8a..c62c2f73 100644
--- a/configs/eval_contamination.py
+++ b/configs/eval_contamination.py
@@ -2,10 +2,14 @@ from mmengine.config import read_base
 
 with read_base():
     from .datasets.ceval.ceval_clean_ppl import ceval_datasets
+    from .datasets.mmlu.mmlu_clean_ppl import mmlu_datasets
+    from .datasets.hellaswag.hellaswag_clean_ppl import hellaswag_datasets
+    from .datasets.ARC_c.ARC_c_clean_ppl import ARC_c_datasets
     from .models.yi.hf_yi_6b import models as hf_yi_6b_model
     from .models.qwen.hf_qwen_7b import models as hf_qwen_7b_model
     from .models.hf_llama.hf_llama2_7b import models as hf_llama2_7b_model
-    from .summarizers.contamination import ceval_summarizer as summarizer
+    from .summarizers.contamination import summarizer
 
-datasets = [*ceval_datasets]
+datasets = [*ceval_datasets, *mmlu_datasets, *hellaswag_datasets, *ARC_c_datasets]
 
diff --git a/configs/summarizers/contamination.py b/configs/summarizers/contamination.py
index 7e0d42b2..46eb6d05 100644
--- a/configs/summarizers/contamination.py
+++ b/configs/summarizers/contamination.py
@@ -59,6 +59,69 @@ ceval_category_weights = {
     'physician': {'accuracy - clean': 24, 'accuracy - input contaminated': 1, 'accuracy - input-and-label contaminated': 24, 'accuracy - not labeled': 0},
 }
 
+mmlu_category_weights = {
+    "business_ethics": {"accuracy - clean": 44, "accuracy - input contaminated": 16, "accuracy - input-and-label contaminated": 38, "accuracy - not labeled": 1},
+    "security_studies": {"accuracy - clean": 188, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 47, "accuracy - not labeled": 0},
+    "high_school_us_history": {"accuracy - clean": 42, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 161},
+    "moral_disputes": {"accuracy - clean": 105, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 168, "accuracy - not labeled": 59},
+    "philosophy": {"accuracy - clean": 81, "accuracy - input contaminated": 11, "accuracy - input-and-label contaminated": 187, "accuracy - not labeled": 31},
+    "public_relations": {"accuracy - clean": 75, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 26, "accuracy - not labeled": 0},
+    "high_school_microeconomics": {"accuracy - clean": 82, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 146, "accuracy - not labeled": 0},
+    "human_sexuality": {"accuracy - clean": 108, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 15, "accuracy - not labeled": 4},
+    "professional_accounting": {"accuracy - clean": 88, "accuracy - input contaminated": 40, "accuracy - input-and-label contaminated": 152, "accuracy - not labeled": 1},
+    "high_school_government_and_politics": {"accuracy - clean": 104, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 82, "accuracy - not labeled": 0},
+    "sociology": {"accuracy - clean": 105, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 91, "accuracy - not labeled": 0},
+    "conceptual_physics": {"accuracy - clean": 79, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 147, "accuracy - not labeled": 0},
+    "human_aging": {"accuracy - clean": 208, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 13, "accuracy - not labeled": 0},
+    "high_school_psychology": {"accuracy - clean": 108, "accuracy - input contaminated": 26, "accuracy - input-and-label contaminated": 162, "accuracy - not labeled": 248},
+    "jurisprudence": {"accuracy - clean": 59, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 43, "accuracy - not labeled": 0},
+    "moral_scenarios": {"accuracy - clean": 320, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 574},
+    "college_medicine": {"accuracy - clean": 107, "accuracy - input contaminated": 16, "accuracy - input-and-label contaminated": 44, "accuracy - not labeled": 5},
+    "high_school_world_history": {"accuracy - clean": 61, "accuracy - input contaminated": 2, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 173},
+    "virology": {"accuracy - clean": 104, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 58, "accuracy - not labeled": 0},
+    "high_school_statistics": {"accuracy - clean": 96, "accuracy - input contaminated": 43, "accuracy - input-and-label contaminated": 76, "accuracy - not labeled": 0},
+    "nutrition": {"accuracy - clean": 172, "accuracy - input contaminated": 11, "accuracy - input-and-label contaminated": 98, "accuracy - not labeled": 24},
+    "abstract_algebra": {"accuracy - clean": 84, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 7, "accuracy - not labeled": 0},
+    "high_school_geography": {"accuracy - clean": 91, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 105, "accuracy - not labeled": 0},
+    "econometrics": {"accuracy - clean": 62, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 38, "accuracy - not labeled": 0},
+    "marketing": {"accuracy - clean": 115, "accuracy - input contaminated": 15, "accuracy - input-and-label contaminated": 101, "accuracy - not labeled": 2},
+    "high_school_chemistry": {"accuracy - clean": 108, "accuracy - input contaminated": 25, "accuracy - input-and-label contaminated": 69, "accuracy - not labeled": 0},
+    "prehistory": {"accuracy - clean": 154, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 107, "accuracy - not labeled": 57},
+    "college_physics": {"accuracy - clean": 25, "accuracy - input contaminated": 20, "accuracy - input-and-label contaminated": 57, "accuracy - not labeled": 0},
+    "management": {"accuracy - clean": 35, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 62, "accuracy - not labeled": 0},
+    "college_biology": {"accuracy - clean": 91, "accuracy - input contaminated": 12, "accuracy - input-and-label contaminated": 40, "accuracy - not labeled": 0},
+    "high_school_biology": {"accuracy - clean": 128, "accuracy - input contaminated": 17, "accuracy - input-and-label contaminated": 135, "accuracy - not labeled": 29},
+    "high_school_physics": {"accuracy - clean": 42, "accuracy - input contaminated": 28, "accuracy - input-and-label contaminated": 80, "accuracy - not labeled": 0},
+    "logical_fallacies": {"accuracy - clean": 133, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 24, "accuracy - not labeled": 0},
+    "medical_genetics": {"accuracy - clean": 49, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 43, "accuracy - not labeled": 1},
+    "machine_learning": {"accuracy - clean": 71, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 32, "accuracy - not labeled": 0},
+    "professional_law": {"accuracy - clean": 401, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 5, "accuracy - not labeled": 1119},
+    "professional_psychology": {"accuracy - clean": 265, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 27, "accuracy - not labeled": 310},
+    "global_facts": {"accuracy - clean": 89, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 5, "accuracy - not labeled": 0},
+    "us_foreign_policy": {"accuracy - clean": 71, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 25, "accuracy - not labeled": 0},
+    "international_law": {"accuracy - clean": 73, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 46, "accuracy - not labeled": 0},
+    "clinical_knowledge": {"accuracy - clean": 172, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 86, "accuracy - not labeled": 0},
+    "high_school_mathematics": {"accuracy - clean": 178, "accuracy - input contaminated": 59, "accuracy - input-and-label contaminated": 32, "accuracy - not labeled": 0},
+    "high_school_computer_science": {"accuracy - clean": 62, "accuracy - input contaminated": 7, "accuracy - input-and-label contaminated": 28, "accuracy - not labeled": 2},
+    "college_computer_science": {"accuracy - clean": 68, "accuracy - input contaminated": 15, "accuracy - input-and-label contaminated": 15, "accuracy - not labeled": 1},
+    "electrical_engineering": {"accuracy - clean": 75, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 61, "accuracy - not labeled": 0},
+    "college_mathematics": {"accuracy - clean": 61, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 26, "accuracy - not labeled": 0},
+    "computer_security": {"accuracy - clean": 55, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 36, "accuracy - not labeled": 0},
+    "high_school_macroeconomics": {"accuracy - clean": 102, "accuracy - input contaminated": 14, "accuracy - input-and-label contaminated": 173, "accuracy - not labeled": 100},
+    "astronomy": {"accuracy - clean": 112, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 35, "accuracy - not labeled": 0},
+    "college_chemistry": {"accuracy - clean": 46, "accuracy - input contaminated": 19, "accuracy - input-and-label contaminated": 34, "accuracy - not labeled": 0},
+    "high_school_european_history": {"accuracy - clean": 41, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 123},
+    "miscellaneous": {"accuracy - clean": 256, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 40, "accuracy - not labeled": 477},
+    "formal_logic": {"accuracy - clean": 92, "accuracy - input contaminated": 12, "accuracy - input-and-label contaminated": 21, "accuracy - not labeled": 0},
+    "elementary_mathematics": {"accuracy - clean": 155, "accuracy - input contaminated": 31, "accuracy - input-and-label contaminated": 103, "accuracy - not labeled": 88},
+    "world_religions": {"accuracy - clean": 130, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 36, "accuracy - not labeled": 0},
+    "professional_medicine": {"accuracy - clean": 191, "accuracy - input contaminated": 43, "accuracy - input-and-label contaminated": 1, "accuracy - not labeled": 36},
+    "anatomy": {"accuracy - clean": 52, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 76, "accuracy - not labeled": 0},
+}
+
+
+ARC_weights = {'accuracy - clean': 836, 'accuracy - input contaminated': 53, 'accuracy - input-and-label contaminated': 283, 'accuracy - not labeled': 0}
+hellaswag_weights = {'accuracy - clean': 5169, 'accuracy - input contaminated': 37, 'accuracy - input-and-label contaminated': 673, 'accuracy - not labeled': 4163}
 
 ceval_stem = ['computer_network', 'operating_system', 'computer_architecture', 'college_programming', 'college_physics', 'college_chemistry', 'advanced_mathematics', 'probability_and_statistics', 'discrete_mathematics', 'electrical_engineer', 'metrology_engineer', 'high_school_mathematics', 'high_school_physics', 'high_school_chemistry', 'high_school_biology', 'middle_school_mathematics', 'middle_school_biology', 'middle_school_physics', 'middle_school_chemistry', 'veterinary_medicine']
 ceval_social_science = ['college_economics', 'business_administration', 'marxism', 'mao_zedong_thought', 'education_science', 'teacher_qualification', 'high_school_politics', 'high_school_geography', 'middle_school_politics', 'middle_school_geography']
@@ -67,7 +130,13 @@ ceval_other = ['civil_servant', 'sports_science', 'plant_protection', 'basic_med
 ceval_hard = ['advanced_mathematics', 'discrete_mathematics', 'probability_and_statistics', 'college_chemistry', 'college_physics', 'high_school_mathematics', 'high_school_chemistry', 'high_school_physics']
 ceval_all = ceval_stem + ceval_social_science + ceval_humanities + ceval_other
 
-name_and_subsets = [
+_mmlu_humanities = ['formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions']
+_mmlu_stem = ['abstract_algebra', 'anatomy', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning']
+_mmlu_social_science = ['econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy']
+_mmlu_other = ['business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology']
+_mmlu_all = _mmlu_humanities + _mmlu_stem + _mmlu_social_science + _mmlu_other
+
+ceval_name_and_subsets = [
     ('ceval', ceval_all),
     ('ceval-stem', ceval_stem),
     ('ceval-social-science', ceval_social_science),
@@ -76,12 +145,20 @@ name_and_subsets = [
     ('ceval-humanities', ceval_humanities),
     ('ceval-other', ceval_other),
     ('ceval-hard', ceval_hard)
 ]
-ceval_summary_groups = []
+mmlu_name_and_subsets = [
+    ('mmlu', _mmlu_all),
+    ('mmlu-humanities', _mmlu_humanities),
+    ('mmlu-stem', _mmlu_stem),
+    ('mmlu-social-science', _mmlu_social_science),
+    ('mmlu-other', _mmlu_other)
+]
+
+summary_groups = []
 for metric_name in ['accuracy - clean', 'accuracy - input contaminated', 'accuracy - input-and-label contaminated']:
-    for dataset_abbr, subsets in name_and_subsets:
+    for dataset_abbr, subsets in ceval_name_and_subsets:
         weights = {f'ceval-{i}': ceval_category_weights[i][metric_name] for i in subsets}
         subsets = [[f'ceval-{i}', metric_name] for i in subsets]
-        ceval_summary_groups.append(
+        summary_groups.append(
             {
                 'name': dataset_abbr,
                 'subsets': subsets,
@@ -89,69 +166,40 @@ for metric_name in ['accuracy - clean', 'accuracy - input contaminated', 'accura
                 'weights': weights,
             }
         )
+
+    for dataset_abbr, subsets in mmlu_name_and_subsets:
+        weights = {f'lukaemon_mmlu_{i}': mmlu_category_weights[i][metric_name] for i in subsets}
+        subsets = [[f'lukaemon_mmlu_{i}', metric_name] for i in subsets]
+        summary_groups.append(
+            {
+                'name': dataset_abbr,
+                'subsets': subsets,
+                'metric': metric_name,
+                'weights': weights,
+            }
+        )
+
+    summary_groups.append(
+        {
+            'name': 'hellaswag',
+            'subsets': [['hellaswag', metric_name]],
+            'metric': metric_name,
+            'weights': {'hellaswag': hellaswag_weights[metric_name]}
+        }
+    )
 
-ceval_summarizer = dict(
+    summary_groups.append(
+        {
+            'name': 'ARC-c-test',
+            'subsets': [['ARC-c-test', metric_name]],
+            'metric': metric_name,
+            'weights': {'ARC-c-test': ARC_weights[metric_name]}
+        }
+    )
+
+summarizer = dict(
     type=CircularSummarizer,
     metric_types=['accuracy - clean', 'accuracy - input contaminated', 'accuracy - input-and-label contaminated'],
-    dataset_abbrs = [
-        'ceval-computer_network',
-        'ceval-operating_system',
-        'ceval-computer_architecture',
-        'ceval-college_programming',
-        'ceval-college_physics',
-        'ceval-college_chemistry',
-        'ceval-advanced_mathematics',
-        'ceval-probability_and_statistics',
-        'ceval-discrete_mathematics',
-        'ceval-electrical_engineer',
-        'ceval-metrology_engineer',
-        'ceval-high_school_mathematics',
-        'ceval-high_school_physics',
-        'ceval-high_school_chemistry',
-        'ceval-high_school_biology',
-        'ceval-middle_school_mathematics',
-        'ceval-middle_school_biology',
-        'ceval-middle_school_physics',
-        'ceval-middle_school_chemistry',
-        'ceval-veterinary_medicine',
-        'ceval-college_economics',
-        'ceval-business_administration',
-        'ceval-marxism',
-        'ceval-mao_zedong_thought',
-        'ceval-education_science',
-        'ceval-teacher_qualification',
-        'ceval-high_school_politics',
-        'ceval-high_school_geography',
-        'ceval-middle_school_politics',
-        'ceval-middle_school_geography',
-        'ceval-modern_chinese_history',
-        'ceval-ideological_and_moral_cultivation',
-        'ceval-logic',
-        'ceval-law',
-        'ceval-chinese_language_and_literature',
-        'ceval-art_studies',
-        'ceval-professional_tour_guide',
-        'ceval-legal_professional',
-        'ceval-high_school_chinese',
-        'ceval-high_school_history',
-        'ceval-middle_school_history',
-        'ceval-civil_servant',
-        'ceval-sports_science',
-        'ceval-plant_protection',
-        'ceval-basic_medicine',
-        'ceval-clinical_medicine',
-        'ceval-urban_and_rural_planner',
-        'ceval-accountant',
-        'ceval-fire_engineer',
-        'ceval-environmental_impact_assessment_engineer',
-        'ceval-tax_accountant',
-        'ceval-physician',
-        'ceval-humanities',
-        'ceval-stem',
-        'ceval-social-science',
-        'ceval-other',
-        'ceval-hard',
-        'ceval',
-    ],
-    summary_groups=ceval_summary_groups,
+    dataset_abbrs = ['ceval', 'ceval-stem', 'ceval-social-science', 'ceval-humanities', 'ceval-other', 'ceval-hard', 'mmlu', 'mmlu-humanities', 'mmlu-stem', 'mmlu-social-science', 'mmlu-other', 'hellaswag', 'ARC-c-test'],
+    summary_groups=summary_groups,
 )
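
The weights attached to each summary group above are per-subset sample counts for the corresponding contamination bucket (taken from the Contamination_Detector annotations), so a group score is a count-weighted average of its subsets' accuracies. A minimal sketch of that aggregation, assuming the summarizer combines subset scores this way (the actual combination is performed by OpenCompass's summarizer, not by this snippet):

def weighted_group_accuracy(subset_acc, weights):
    """Count-weighted average accuracy over the subsets of one summary group."""
    total = sum(weights[s] for s in subset_acc)
    if total == 0:  # e.g. a bucket such as 'not labeled' with no samples
        return float('nan')
    return sum(subset_acc[s] * weights[s] for s in subset_acc) / total

# Two MMLU subsets on the 'accuracy - clean' metric, using the clean-sample
# counts from mmlu_category_weights above (anatomy: 52, virology: 104):
print(weighted_group_accuracy(
    {'lukaemon_mmlu_anatomy': 60.0, 'lukaemon_mmlu_virology': 50.0},
    {'lukaemon_mmlu_anatomy': 52, 'lukaemon_mmlu_virology': 104},
))  # -> 53.33...
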
diff --git a/opencompass/datasets/arc.py b/opencompass/datasets/arc.py
index 1cf50f11..9b512114 100644
--- a/opencompass/datasets/arc.py
+++ b/opencompass/datasets/arc.py
@@ -1,4 +1,5 @@
 import json
+import os.path as osp
 
 from datasets import Dataset
 
@@ -30,3 +31,54 @@ class ARCDataset(BaseDataset):
                     'textD': question['choices'][3]['text'],
                 })
         return Dataset.from_list(rows)
+
+
+class ARCDatasetClean(BaseDataset):
+
+    # load the contamination annotations of ARC from
+    # https://github.com/liyucheng09/Contamination_Detector
+    @staticmethod
+    def load_contamination_annotations(path, split='val'):
+        import requests
+
+        assert split == 'test', 'We only have test set annotation for ARC'
+        annotation_cache_path = osp.join(
+            path, f'ARC_c_{split}_contamination_annotations.json')
+        if osp.exists(annotation_cache_path):
+            with open(annotation_cache_path, 'r') as f:
+                annotations = json.load(f)
+            return annotations
+        link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc/ARC_annotations.json'  # noqa
+        annotations = json.loads(requests.get(link_of_annotations).text)
+        with open(annotation_cache_path, 'w') as f:
+            json.dump(annotations, f)
+        return annotations
+
+    @staticmethod
+    def load(path: str):
+        annotations = ARCDatasetClean.load_contamination_annotations(
+            osp.dirname(path), 'test')
+        with open(path, 'r', errors='ignore') as in_f:
+            rows = []
+            for line in in_f:
+                item = json.loads(line.strip())
+                id_ = item['id']
+                question = item['question']
+                if id_ in annotations:
+                    is_clean = annotations[id_][0]
+                else:
+                    is_clean = 'not labeled'
+                if len(question['choices']) != 4:
+                    continue
+                labels = [c['label'] for c in question['choices']]
+                answerKey = 'ABCD'[labels.index(item['answerKey'])]
+                rows.append({
+                    'question': question['stem'],
+                    'answerKey': answerKey,
+                    'textA': question['choices'][0]['text'],
+                    'textB': question['choices'][1]['text'],
+                    'textC': question['choices'][2]['text'],
+                    'textD': question['choices'][3]['text'],
+                    'is_clean': is_clean,
+                })
+        return Dataset.from_list(rows)
diff --git a/opencompass/datasets/hellaswag.py b/opencompass/datasets/hellaswag.py
index d615ea9e..f1879e50 100644
--- a/opencompass/datasets/hellaswag.py
+++ b/opencompass/datasets/hellaswag.py
@@ -1,4 +1,5 @@
 import json
+import os.path as osp
 
 from datasets import Dataset
 
@@ -68,3 +69,50 @@ class hellaswagDataset_V3(BaseDataset):
             })
         dataset = Dataset.from_list(dataset)
         return dataset
+
+
+class hellaswagDatasetClean(BaseDataset):
+
+    # load the contamination annotations of HellaSwag from
+    # https://github.com/liyucheng09/Contamination_Detector
+    @staticmethod
+    def load_contamination_annotations(path, split='val'):
+        import requests
+
+        assert split == 'val', 'We only use val set of hellaswag'
+        annotation_cache_path = osp.join(
+            path, f'hellaswag_{split}_contamination_annotations.json')
+        if osp.exists(annotation_cache_path):
+            with open(annotation_cache_path, 'r') as f:
+                annotations = json.load(f)
+            return annotations
+        link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc2/hellaswag_annotations_with_line_index.json'  # noqa
+        annotations = json.loads(requests.get(link_of_annotations).text)
+        with open(annotation_cache_path, 'w') as f:
+            json.dump(annotations, f)
+        return annotations
+
+    @staticmethod
+    def load(path):
+        dataset = []
+        annotations = hellaswagDatasetClean.load_contamination_annotations(
+            osp.dirname(path))
+        with open(path, 'r', encoding='utf-8') as f:
+            for row_index, line in enumerate(f):
+                data = json.loads(line)
+                row_index = f'{row_index}'
+                if row_index in annotations:
+                    is_clean = annotations[row_index][0]
+                else:
+                    is_clean = 'not labeled'
+                dataset.append({
+                    'ctx': data['query'].split(': ', 2)[-1],
+                    'A': data['choices'][0],
+                    'B': data['choices'][1],
+                    'C': data['choices'][2],
+                    'D': data['choices'][3],
+                    'label': data['gold'],
+                    'is_clean': is_clean,
+                })
+        dataset = Dataset.from_list(dataset)
+        return dataset
diff --git a/opencompass/datasets/mmlu.py b/opencompass/datasets/mmlu.py
index ee0fc57e..76cd57f2 100644
--- a/opencompass/datasets/mmlu.py
+++ b/opencompass/datasets/mmlu.py
@@ -1,4 +1,5 @@
 import csv
+import json
 import os.path as osp
 
 from datasets import Dataset, DatasetDict
@@ -31,3 +32,57 @@ class MMLUDataset(BaseDataset):
                     })
             dataset[split] = Dataset.from_list(raw_data)
         return dataset
+
+
+class MMLUDatasetClean(BaseDataset):
+
+    # load the contamination annotations of MMLU from
+    # https://github.com/liyucheng09/Contamination_Detector
+    @staticmethod
+    def load_contamination_annotations(path, split='val'):
+        import requests
+
+        assert split == 'test', 'We only use test set for MMLU'
+        annotation_cache_path = osp.join(
+            path, split, f'MMLU_{split}_contamination_annotations.json')
+        if osp.exists(annotation_cache_path):
+            with open(annotation_cache_path, 'r') as f:
+                annotations = json.load(f)
+            return annotations
+        link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc2/mmlu_annotations.json'  # noqa
+        annotations = json.loads(requests.get(link_of_annotations).text)
+        with open(annotation_cache_path, 'w') as f:
+            json.dump(annotations, f)
+        return annotations
+
+    @staticmethod
+    def load(path: str, name: str):
+        dataset = DatasetDict()
+        for split in ['dev', 'test']:
+            raw_data = []
+            filename = osp.join(path, split, f'{name}_{split}.csv')
+            if split == 'test':
+                annotations = MMLUDatasetClean.load_contamination_annotations(
+                    path, split)
+            with open(filename, encoding='utf-8') as f:
+                reader = csv.reader(f)
+                for row_index, row in enumerate(reader):
+                    assert len(row) == 6
+                    item = {
+                        'input': row[0],
+                        'A': row[1],
+                        'B': row[2],
+                        'C': row[3],
+                        'D': row[4],
+                        'target': row[5],
+                    }
+                    if split == 'test':
+                        row_id = f'{name} {row_index}'
+                        if row_id in annotations:
+                            is_clean = annotations[row_id][0]
+                        else:
+                            is_clean = 'not labeled'
+                        item['is_clean'] = is_clean
+                    raw_data.append(item)
+            dataset[split] = Dataset.from_list(raw_data)
+        return dataset
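
For context on how the new is_clean field is consumed: each loader tags a sample with the first element of its annotation entry and falls back to 'not labeled' when no annotation exists, and the summarizer reports one accuracy per tag. A rough per-bucket accuracy sketch, assuming the tags match the metric suffixes used above (the real metric is computed by AccContaminationEvaluator, not by this snippet):

from collections import defaultdict

def accuracy_by_contamination(predictions, references, tags):
    """Accuracy over samples grouped by their contamination tag."""
    correct, total = defaultdict(int), defaultdict(int)
    for pred, ref, tag in zip(predictions, references, tags):
        total[tag] += 1
        correct[tag] += int(pred == ref)
    return {f'accuracy - {tag}': 100.0 * correct[tag] / total[tag]
            for tag in total}

print(accuracy_by_contamination(
    ['A', 'B', 'C', 'D'],
    ['A', 'B', 'D', 'D'],
    ['clean', 'clean', 'input contaminated', 'not labeled'],
))
# {'accuracy - clean': 100.0, 'accuracy - input contaminated': 0.0,
#  'accuracy - not labeled': 100.0}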