[Feature] Add AceGPT-MMLUArabic benchmark (#1099)

* add AceGPT-MMLUArabic benchmark * update readme and fix lint issue * remove unused package * add MMLUArabic zero-shot settings * rename filename and update readme
2025-05-30 16:03:24 +08:00 · 2024-05-08 15:00:26 +08:00 · 2024-05-08 15:00:26 +08:00 · d2c40e5648
commit d2c40e5648
parent 862044fb7d
10 changed files with 285 additions and 0 deletions
--- a/configs/datasets/MMLUArabic/MMLUArabic_gen.py
+++ b/configs/datasets/MMLUArabic/MMLUArabic_gen.py
@ -0,0 +1,4 @@
 from mmengine.config import read_base
 with read_base():
    from .MMLUArabic_gen_326684 import MMLUArabic_datasets  # noqa: F401, F403
--- a/configs/datasets/MMLUArabic/MMLUArabic_gen_326684.py
+++ b/configs/datasets/MMLUArabic/MMLUArabic_gen_326684.py
@ -0,0 +1,59 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import MMLUArabicDataset
 from opencompass.utils.text_postprocessors import first_option_postprocess
 # None of the MMLUArabic datasets on Hugging Face is parsed correctly, so we use our own dataset reader.
 # Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
 # Reader configuration shared by every subject: the five input columns, the
 # output column, and 'dev' as the train split.
 MMLUArabic_reader_cfg = dict(
    input_columns=["input", "A", "B", "C", "D"],
    output_column="target",
    train_split='dev')
 # English subject identifiers; used below as the dataset `name` and in `abbr`.
 MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
 # Arabic subject titles; zipped pairwise with MMLUArabic_all_sets below, so
 # both lists must keep the same length and order.
 MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم']
 # Build one few-shot generation dataset config per subject.
 MMLUArabic_datasets = []
 # zip() pairs each English subject identifier with its Arabic title.
 for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
    # System prompt naming the subject; underscores in the Arabic title are
    # turned into spaces so it reads as natural text.
    _system = f"فيما يلي أسئلة الاختيار من متعدد (مع الإجابات) حول {_name_ar.replace('_', ' ')}"
    _hint = "\n{input}"
    MMLUArabic_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(role='SYSTEM', fallback_role='HUMAN', prompt=_system),
                    # Placeholder declared as `ice_token` below.
                    '</E>',
                ],
                round=[
                    dict(
                        role="HUMAN",
                        # Only `{input}` of _hint is filled here; the inner
                        # {input}/{A}..{D} fields stay as template slots.
                        prompt=_hint.format(input="سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}")
                    ),
                    dict(role="BOT", prompt="إجابة: {target}")
                ]),
            ice_token="</E>",
        ),
        # Five fixed example ids (0-4); presumably taken from the 'dev' train
        # split - confirm against FixKRetriever.
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )
    MMLUArabic_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
    MMLUArabic_datasets.append(
        dict(
            abbr=f"acegpt_MMLUArabic_{_name}",
            type=MMLUArabicDataset,
            path="./data/MMLUArabic/",
            name=_name,
            reader_cfg=MMLUArabic_reader_cfg,
            infer_cfg=MMLUArabic_infer_cfg,
            eval_cfg=MMLUArabic_eval_cfg,
        ))
 # Drop every loop temporary so none of them ends up in the loaded config.
 del _name, _name_ar, _system, _hint
--- a/configs/datasets/MMLUArabic/MMLUArabic_ppl.py
+++ b/configs/datasets/MMLUArabic/MMLUArabic_ppl.py
@ -0,0 +1,4 @@
 from mmengine.config import read_base
 with read_base():
    from .MMLUArabic_ppl_d2333a import MMLUArabic_datasets  # noqa: F401, F403
--- a/configs/datasets/MMLUArabic/MMLUArabic_ppl_d2333a.py
+++ b/configs/datasets/MMLUArabic/MMLUArabic_ppl_d2333a.py
@ -0,0 +1,51 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import MMLUArabicDataset
 # None of the MMLUArabic datasets on Hugging Face is parsed correctly, so we use our own dataset reader.
 # Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
 # Reader configuration shared by every subject: the five input columns, the
 # output column, and 'dev' as the train split.
 MMLUArabic_reader_cfg = dict(
    input_columns=["input", "A", "B", "C", "D"],
    output_column="target",
    train_split='dev')
 # English subject identifiers; used below as the dataset `name` and in `abbr`.
 MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
 # Arabic subject titles; zipped pairwise with MMLUArabic_all_sets below, so
 # both lists must keep the same length and order.
 MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم']
 # Build one few-shot perplexity (PPL) dataset config per subject.
 MMLUArabic_datasets = []
 # zip() pairs each English subject identifier with its Arabic title.
 for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
    # Arabic counterpart of the English MMLU hint "The following are multiple
    # choice questions about <subject>"; underscores in the Arabic title are
    # turned into spaces.
    _hint = f"فيما يلي أسئلة الاختيار من متعدد حول {_name_ar.replace('_', ' ')}\n\n"
    # Question layout shared by the in-context examples and the final prompt.
    question_overall = "سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}"
    MMLUArabic_infer_cfg = dict(
        # One template per candidate answer, keyed by the option letter.
        ice_template=dict(
            type=PromptTemplate,
            template={opt: f"{question_overall}\nإجابة: {opt}\n" for opt in ["A", "B", "C", "D"]},
        ),
        # Final prompt: hint, the `</E>` placeholder declared as `ice_token`,
        # then the question followed by the candidate answer.
        prompt_template=dict(
            type=PromptTemplate,
            template={opt: f"{_hint}</E>{question_overall}\nإجابة: {opt}" for opt in ["A", "B", "C", "D"]},
            ice_token="</E>",
        ),
        # Five fixed example ids (0-4); presumably taken from the 'dev' train
        # split - confirm against FixKRetriever.
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=PPLInferencer),
    )
    MMLUArabic_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
    MMLUArabic_datasets.append(
        dict(
            abbr=f"acegpt_MMLUArabic_{_name}",
            type=MMLUArabicDataset,
            path="./data/MMLUArabic/",
            name=_name,
            reader_cfg=MMLUArabic_reader_cfg,
            infer_cfg=MMLUArabic_infer_cfg,
            eval_cfg=MMLUArabic_eval_cfg,
        ))
 # Drop the private loop temporaries so they do not end up in the loaded config.
 del _name, _name_ar, _hint
--- a/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen.py
+++ b/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen.py
@ -0,0 +1,4 @@
 from mmengine.config import read_base
 with read_base():
    from .MMLUArabic_zero_shot_gen_3523e0 import MMLUArabic_datasets  # noqa: F401, F403
--- a/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen_3523e0.py
+++ b/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen_3523e0.py
@ -0,0 +1,53 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import MMLUArabicDataset
 from opencompass.utils.text_postprocessors import first_option_postprocess
 # None of the MMLUArabic datasets on Hugging Face is parsed correctly, so we use our own dataset reader.
 # Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
 # Reader configuration shared by every subject: the five input columns, the
 # output column, and 'dev' as the train split.
 MMLUArabic_reader_cfg = dict(
    input_columns=["input", "A", "B", "C", "D"],
    output_column="target",
    train_split='dev')
 # English subject identifiers; used below as the dataset `name` and in `abbr`.
 MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
 # Arabic subject titles; zipped pairwise with MMLUArabic_all_sets below, so
 # both lists must keep the same length and order.
 MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم']
 # Build one zero-shot generation dataset config per subject.
 MMLUArabic_datasets = []
 # zip() pairs each English subject identifier with its Arabic title.
 for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
    # Prompt skeleton: subject hint, an `{input}` slot for the question block,
    # then an instruction to answer with one of A/B/C/D without explanation.
    _hint = f"فيما يلي أسئلة الاختيار من متعدد حول {_name_ar.replace('_', ' ')}\n\n" + "{input}\n" + "من فضلك اختر إجابة واحدة من بين 'A، B، C، D' دون شرح."
    MMLUArabic_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(
                        role="HUMAN",
                        # Only `{input}` of _hint is filled here; the inner
                        # {input}/{A}..{D} fields stay as template slots.
                        prompt=_hint.format(input="سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}")
                    ),
                ]),
            ice_token="</E>",
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )
    MMLUArabic_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
    MMLUArabic_datasets.append(
        dict(
            abbr=f"acegpt_MMLUArabic_{_name}",
            type=MMLUArabicDataset,
            path="./data/MMLUArabic/",
            name=_name,
            reader_cfg=MMLUArabic_reader_cfg,
            infer_cfg=MMLUArabic_infer_cfg,
            eval_cfg=MMLUArabic_eval_cfg,
        ))
 # Drop every loop temporary so none of them ends up in the loaded config.
 del _name, _name_ar, _hint
--- a/configs/datasets/MMLUArabic/README.md
+++ b/configs/datasets/MMLUArabic/README.md
@ -0,0 +1,26 @@
 # MMLUArabic
 ## Dataset Description
 MMLUArabic is a benchmark for the assessment of knowledge in Arabic and covers a wide range of topics and aspects, consisting of multiple-choice questions in various branches of knowledge.
 ## How to Use
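 Download the dataset files from the [AceGPT repository](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic) and place them under `./data/MMLUArabic/`, the path used by the dataset configs.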
 ```python
 val_ds = load_dataset("MMLUArabic", header=None)['validation']
 test_ds = load_dataset("MMLUArabic", header=None)['test']
 # input, option_a, option_b, option_c, option_d, target
 print(next(iter(val_ds))) 
 ```
 ## Citation
 ```
@misc{huang2023acegpt,
      title={AceGPT, Localizing Large Language Models in Arabic}, 
      author={Huang Huang and Fei Yu and Jianqing Zhu and Xuening Sun and Hao Cheng and Dingjie Song and Zhihong Chen and Abdulmohsen Alharthi and Bang An and Ziche Liu and Zhiyi Zhang and Junying Chen and Jianquan Li and Benyou Wang and Lian Zhang and Ruoyu Sun and Xiang Wan and Haizhou Li and Jinchao Xu},
      year={2023},
      eprint={2309.12053},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
 }
 ```
--- a/configs/summarizers/groups/MMLUArabic.py
+++ b/configs/summarizers/groups/MMLUArabic.py
@ -0,0 +1,50 @@
 # Discipline -> MMLUArabic subject names belonging to it.
 sub_categories = {
    'math': ['abstract_algebra', 'college_mathematics', 'elementary_mathematics', 'high_school_mathematics', 'high_school_statistics'],
    'health': ['anatomy', 'clinical_knowledge', 'college_medicine', 'human_aging', 'medical_genetics', 'nutrition', 'professional_medicine', 'virology'],
    'physics': ['astronomy', 'college_physics', 'conceptual_physics', 'high_school_physics'],
    'business': ['business_ethics', 'management', 'marketing'],
    'biology': ['college_biology', 'high_school_biology'],
    'chemistry': ['college_chemistry', 'high_school_chemistry'],
    'computer science': ['college_computer_science', 'computer_security', 'high_school_computer_science', 'machine_learning'],
    'economics': ['econometrics', 'high_school_macroeconomics', 'high_school_microeconomics'],
    'engineering': ['electrical_engineering'],
    'philosophy': ['formal_logic', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'world_religions'],
    'other': ['global_facts', 'miscellaneous', 'professional_accounting'],
    'history': ['high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'prehistory'],
    'geography': ['high_school_geography'],
    'politics': ['high_school_government_and_politics', 'public_relations', 'security_studies', 'us_foreign_policy'],
    'psychology': ['high_school_psychology', 'professional_psychology'],
    'culture': ['human_sexuality', 'sociology'],
    'law': ['international_law', 'jurisprudence', 'professional_law']
 }
 # Top-level category -> disciplines (keys of sub_categories) it contains.
 categories = {
    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
    "humanities": ["history", "philosophy", "law"],
    "social_sciences": ["politics", "culture", "economics", "geography", "psychology"],
    "other": ["other", "business", "health"],
 }
 # Top-level category -> flat list of subject names. Subjects are appended in
 # the iteration order of sub_categories, not in the order listed in categories.
 category2subject = {}
 for _category, _disciplines in categories.items():
    for _discipline, _subjects in sub_categories.items():
        if _discipline in _disciplines:
            category2subject.setdefault(_category, []).extend(_subjects)
 # Remove the loop temporaries so they are not exposed as module/config names.
 del _category, _disciplines, _discipline, _subjects
 # One summary group per top-level category plus one covering every subject;
 # subset names carry the 'acegpt_MMLUArabic_' prefix used by the dataset abbrs.
 MMLUArabic_summary_groups = []
 _MMLUArabic_stem = ['acegpt_MMLUArabic_' + s for s in category2subject['STEM']]
 MMLUArabic_summary_groups.append({'name': 'acegpt_MMLUArabic_STEM', 'subsets': _MMLUArabic_stem})
 _MMLUArabic_humanities = ['acegpt_MMLUArabic_' + s for s in category2subject['humanities']]
 MMLUArabic_summary_groups.append({'name': 'acegpt_MMLUArabic_humanities', 'subsets': _MMLUArabic_humanities})
 _MMLUArabic_social_science = ['acegpt_MMLUArabic_' + s for s in category2subject['social_sciences']]
 MMLUArabic_summary_groups.append({'name': 'acegpt_MMLUArabic_social_science', 'subsets': _MMLUArabic_social_science})
 _MMLUArabic_other = ['acegpt_MMLUArabic_' + s for s in category2subject['other']]
 MMLUArabic_summary_groups.append({'name': 'acegpt_MMLUArabic_other', 'subsets': _MMLUArabic_other})
 _MMLUArabic_all = _MMLUArabic_stem + _MMLUArabic_humanities + _MMLUArabic_social_science + _MMLUArabic_other
 MMLUArabic_summary_groups.append({'name': 'acegpt_MMLUArabic', 'subsets': _MMLUArabic_all})
--- a/opencompass/datasets/MMLUArabic.py
+++ b/opencompass/datasets/MMLUArabic.py
@ -0,0 +1,33 @@
 import csv
 import os.path as osp
 from datasets import Dataset, DatasetDict
 from opencompass.registry import LOAD_DATASET
 from .base import BaseDataset
@LOAD_DATASET.register_module()
class MMLUArabicDataset(BaseDataset):
    """Loader for the MMLUArabic multiple-choice CSV files.

    Files are expected at ``<path>/<split>/<name>_<split>.csv`` for the
    ``dev`` and ``test`` splits. Every CSV row is treated as a data row.
    """

    @staticmethod
    def load(path: str, name: str):
        """Read the ``dev`` and ``test`` CSV files of one subject.

        Args:
            path: Directory that contains the ``dev`` and ``test`` folders.
            name: Subject name used as the CSV filename prefix.

        Returns:
            A ``DatasetDict`` with ``dev`` and ``test`` entries; each row has
            the keys ``input``, ``A``, ``B``, ``C``, ``D`` and ``target``.

        Raises:
            ValueError: If a CSV row does not have exactly six columns.
        """
        columns = ('input', 'A', 'B', 'C', 'D', 'target')
        dataset = DatasetDict()
        for split in ['dev', 'test']:
            raw_data = []
            filename = osp.join(path, split, f'{name}_{split}.csv')
            # newline='' leaves line-ending handling to the csv module, which
            # is required for quoted fields that contain line breaks.
            with open(filename, encoding='utf-8', newline='') as f:
                reader = csv.reader(f)
                for row in reader:
                    # Explicit check instead of `assert`, which is stripped
                    # when Python runs with -O.
                    if len(row) != len(columns):
                        raise ValueError(
                            f'{filename}, line {reader.line_num}: expected '
                            f'{len(columns)} columns, got {len(row)}')
                    raw_data.append(dict(zip(columns, row)))
            dataset[split] = Dataset.from_list(raw_data)
        return dataset
--- a/opencompass/datasets/init.py
+++ b/opencompass/datasets/init.py
@ -73,6 +73,7 @@ from .mbpp import *  # noqa: F401, F403
 from .medbench import *  # noqa: F401, F403
 from .mgsm import *  # noqa: F401, F403
 from .mmlu import *  # noqa: F401, F403
 from .MMLUArabic import * # noqa: F401, F403
 from .multirc import *  # noqa: F401, F403
 from .narrativeqa import *  # noqa: F401, F403
 from .natural_question import *  # noqa: F401, F403