[Feature] Add AceGPT-MMLUArabic benchmark (#1099)

* add AceGPT-MMLUArabic benchmark

* update readme and fix lint issue

* remove unused package

* add MMLUArabic zero-shot settings

* rename filename and update readme
This commit is contained in:
JuhaoLiang 2024-05-08 15:00:26 +08:00 committed by GitHub
parent 862044fb7d
commit d2c40e5648
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 285 additions and 0 deletions

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .MMLUArabic_gen_326684 import MMLUArabic_datasets # noqa: F401, F403

View File

@ -0,0 +1,59 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUArabicDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
# None of the MMLUArabic dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
MMLUArabic_reader_cfg = dict(
input_columns=["input", "A", "B", "C", "D"],
output_column="target",
train_split='dev')
MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم']
MMLUArabic_datasets = []
for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
_system = f"فيما يلي أسئلة الاختيار من متعدد (مع الإجابات) حول {' '.join(_name_ar.split('_'))}"
_hint = "\n{input}"
MMLUArabic_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt=_system),
'</E>',
],
round=[
dict(
role="HUMAN",
prompt=_hint.format(input="سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}")
),
dict(role="BOT", prompt="إجابة: {target}")
]),
ice_token="</E>",
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=GenInferencer),
)
MMLUArabic_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
MMLUArabic_datasets.append(
dict(
abbr=f"acegpt_MMLUArabic_{_name}",
type=MMLUArabicDataset,
path="./data/MMLUArabic/",
name=_name,
reader_cfg=MMLUArabic_reader_cfg,
infer_cfg=MMLUArabic_infer_cfg,
eval_cfg=MMLUArabic_eval_cfg,
))
del _name, _hint

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .MMLUArabic_ppl_d2333a import MMLUArabic_datasets # noqa: F401, F403

View File

@ -0,0 +1,51 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUArabicDataset
# None of the MMLUArabic dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
MMLUArabic_reader_cfg = dict(
input_columns=["input", "A", "B", "C", "D"],
output_column="target",
train_split='dev')
MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم']
MMLUArabic_datasets = []
for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
# _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
_hint = f"فيما يلي أسئلة الاختيار من متعدد حول {' '.join(_name_ar.split('_'))}\n\n"
# question_overall = '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
question_overall = "سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}"
MMLUArabic_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={opt: f"{question_overall}\nإجابة: {opt}\n" for opt in ["A", "B", "C", "D"]},
),
prompt_template=dict(
type=PromptTemplate,
template={opt: f"{_hint}</E>{question_overall}\nإجابة: {opt}" for opt in ["A", "B", "C", "D"]},
ice_token="</E>",
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer),
)
MMLUArabic_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
MMLUArabic_datasets.append(
dict(
abbr=f"acegpt_MMLUArabic_{_name}",
type=MMLUArabicDataset,
path="./data/MMLUArabic/",
name=_name,
reader_cfg=MMLUArabic_reader_cfg,
infer_cfg=MMLUArabic_infer_cfg,
eval_cfg=MMLUArabic_eval_cfg,
))
del _name, _hint

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .MMLUArabic_zero_shot_gen_3523e0 import MMLUArabic_datasets # noqa: F401, F403

View File

@ -0,0 +1,53 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUArabicDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
# None of the MMLUArabic dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
MMLUArabic_reader_cfg = dict(
input_columns=["input", "A", "B", "C", "D"],
output_column="target",
train_split='dev')
MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم']
MMLUArabic_datasets = []
for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
_hint = f"فيما يلي أسئلة الاختيار من متعدد حول {' '.join(_name_ar.split('_'))}\n\n" + "{input}\n" + "من فضلك اختر إجابة واحدة من بين 'A، B، C، D' دون شرح."
MMLUArabic_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt=_hint.format(input="سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}")
),
]),
ice_token="</E>",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
MMLUArabic_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
MMLUArabic_datasets.append(
dict(
abbr=f"acegpt_MMLUArabic_{_name}",
type=MMLUArabicDataset,
path="./data/MMLUArabic/",
name=_name,
reader_cfg=MMLUArabic_reader_cfg,
infer_cfg=MMLUArabic_infer_cfg,
eval_cfg=MMLUArabic_eval_cfg,
))
del _name, _hint

View File

@ -0,0 +1,26 @@
# MMLUArabic
## Dataset Description
MMLUArabic is a benchmark for the assessment of knowledge in Arabic and covers a wide range of topics and aspects, consisting of multiple-choice questions in various branches of knowledge.
## How to Use
Download file from [link](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic)
```python
val_ds = load_dataset("MMLUArabic", header=None)['validation']
test_ds = load_dataset("MMLUArabic", header=None)['test']
# input, option_a, option_b, option_c, option_d, target
print(next(iter(val_ds)))
```
## Citation
```
@misc{huang2023acegpt,
title={AceGPT, Localizing Large Language Models in Arabic},
author={Huang Huang and Fei Yu and Jianqing Zhu and Xuening Sun and Hao Cheng and Dingjie Song and Zhihong Chen and Abdulmohsen Alharthi and Bang An and Ziche Liu and Zhiyi Zhang and Junying Chen and Jianquan Li and Benyou Wang and Lian Zhang and Ruoyu Sun and Xiang Wan and Haizhou Li and Jinchao Xu},
year={2023},
eprint={2309.12053},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```

View File

@ -0,0 +1,50 @@
sub_categories = {
'math': ['abstract_algebra', 'college_mathematics', 'elementary_mathematics', 'high_school_mathematics', 'high_school_statistics'],
'health': ['anatomy', 'clinical_knowledge', 'college_medicine', 'human_aging', 'medical_genetics', 'nutrition', 'professional_medicine', 'virology'],
'physics': ['astronomy', 'college_physics', 'conceptual_physics', 'high_school_physics'],
'business': ['business_ethics', 'management', 'marketing'],
'biology': ['college_biology', 'high_school_biology'],
'chemistry': ['college_chemistry', 'high_school_chemistry'],
'computer science': ['college_computer_science', 'computer_security', 'high_school_computer_science', 'machine_learning'],
'economics': ['econometrics', 'high_school_macroeconomics', 'high_school_microeconomics'],
'engineering': ['electrical_engineering'],
'philosophy': ['formal_logic', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'world_religions'],
'other': ['global_facts', 'miscellaneous', 'professional_accounting'],
'history': ['high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'prehistory'],
'geography': ['high_school_geography'],
'politics': ['high_school_government_and_politics', 'public_relations', 'security_studies', 'us_foreign_policy'],
'psychology': ['high_school_psychology', 'professional_psychology'],
'culture': ['human_sexuality', 'sociology'],
'law': ['international_law', 'jurisprudence', 'professional_law']
}
categories = {
"STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
"humanities": ["history", "philosophy", "law"],
"social_sciences": ["politics", "culture", "economics", "geography", "psychology"],
"other": ["other", "business", "health"],
}
category2subject = {}
for k, v in categories.items():
for subject, subcat in sub_categories.items():
if subject in v:
for c in subcat:
category2subject.setdefault(k, []).append(c)
MMLUArabic_summary_groups = []
_MMLUArabic_stem = ['acegpt_MMLUArabic_' + s for s in category2subject['STEM']]
MMLUArabic_summary_groups.append({'name': 'acegpt_MMLUArabic_STEM', 'subsets': _MMLUArabic_stem})
_MMLUArabic_humanities = ['acegpt_MMLUArabic_' + s for s in category2subject['humanities']]
MMLUArabic_summary_groups.append({'name': 'acegpt_MMLUArabic_humanities', 'subsets': _MMLUArabic_humanities})
_MMLUArabic_social_science = ['acegpt_MMLUArabic_' + s for s in category2subject['social_sciences']]
MMLUArabic_summary_groups.append({'name': 'acegpt_MMLUArabic_social_science', 'subsets': _MMLUArabic_social_science})
_MMLUArabic_other = ['acegpt_MMLUArabic_' + s for s in category2subject['other']]
MMLUArabic_summary_groups.append({'name': 'acegpt_MMLUArabic_other', 'subsets': _MMLUArabic_other})
_MMLUArabic_all = _MMLUArabic_stem + _MMLUArabic_humanities + _MMLUArabic_social_science + _MMLUArabic_other
MMLUArabic_summary_groups.append({'name': 'acegpt_MMLUArabic', 'subsets': _MMLUArabic_all})

View File

@ -0,0 +1,33 @@
import csv
import os.path as osp
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class MMLUArabicDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
dataset = DatasetDict()
for split in ['dev', 'test']:
raw_data = []
filename = osp.join(path, split, f'{name}_{split}.csv')
with open(filename, encoding='utf-8') as f:
reader = csv.reader(f)
for row in reader:
assert len(row) == 6
raw_data.append({
'input': row[0],
'A': row[1],
'B': row[2],
'C': row[3],
'D': row[4],
'target': row[5],
})
dataset[split] = Dataset.from_list(raw_data)
return dataset

View File

@ -73,6 +73,7 @@ from .mbpp import * # noqa: F401, F403
from .medbench import * # noqa: F401, F403 from .medbench import * # noqa: F401, F403
from .mgsm import * # noqa: F401, F403 from .mgsm import * # noqa: F401, F403
from .mmlu import * # noqa: F401, F403 from .mmlu import * # noqa: F401, F403
from .MMLUArabic import * # noqa: F401, F403
from .multirc import * # noqa: F401, F403 from .multirc import * # noqa: F401, F403
from .narrativeqa import * # noqa: F401, F403 from .narrativeqa import * # noqa: F401, F403
from .natural_question import * # noqa: F401, F403 from .natural_question import * # noqa: F401, F403