2023-07-04 21:34:55 +08:00
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
|
|
|
from opencompass.openicl.icl_retriever import TopkRetriever
|
|
|
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
|
|
|
from opencompass.openicl.icl_evaluator import BleuEvaluator
|
|
|
|
from opencompass.datasets import FloresFirst100Dataset
|
|
|
|
|
|
|
|
# Language table for the Flores subtasks. Every generated row is a 4-item list:
#   [short code, Flores column code, English display name, family label]
# The short code is what this file uses as the lookup key and in subtask
# names; the Flores column code is what it interpolates into the
# ``sentence_<code>`` column names. Rows are grouped by family label here, and
# the comprehension flattens the groups back into one flat list, preserving
# the order in which they are written.
_flores_lang_map = [
    [code, flores_code, language, family]
    for family, members in (
        ('Indo-European-Germanic', (
            ('eng', 'eng_Latn', 'English'),
            ('afr', 'afr_Latn', 'Afrikaans'),
            ('dan', 'dan_Latn', 'Danish'),
            ('deu', 'deu_Latn', 'German'),
            ('isl', 'isl_Latn', 'Icelandic'),
            ('ltz', 'ltz_Latn', 'Luxembourgish'),
            ('nld', 'nld_Latn', 'Dutch'),
            ('nob', 'nob_Latn', 'Norwegian'),
            ('swe', 'swe_Latn', 'Swedish'),
        )),
        ('Indo-European-Romance', (
            ('ast', 'ast_Latn', 'Asturian'),
            ('cat', 'cat_Latn', 'Catalan'),
            ('fra', 'fra_Latn', 'French'),
            ('glg', 'glg_Latn', 'Galician'),
            ('oci', 'oci_Latn', 'Occitan'),
            ('por', 'por_Latn', 'Portuguese'),
            ('ron', 'ron_Latn', 'Romanian'),
            ('spa', 'spa_Latn', 'Spanish'),
        )),
        ('Indo-European-Slavic', (
            ('bel', 'bel_Cyrl', 'Belarusian'),
            ('bos', 'bos_Latn', 'Bosnian'),
            ('bul', 'bul_Cyrl', 'Bulgarian'),
            ('ces', 'ces_Latn', 'Czech'),
            ('hrv', 'hrv_Latn', 'Croatian'),
            ('mkd', 'mkd_Cyrl', 'Macedonian'),
            ('pol', 'pol_Latn', 'Polish'),
            ('rus', 'rus_Cyrl', 'Russian'),
            ('slk', 'slk_Latn', 'Slovak'),
            ('slv', 'slv_Latn', 'Slovenian'),
            ('srp', 'srp_Cyrl', 'Serbian'),
            ('ukr', 'ukr_Cyrl', 'Ukrainian'),
        )),
        ('Indo-European-Indo-Aryan', (
            ('asm', 'asm_Beng', 'Assamese'),
            ('ben', 'ben_Beng', 'Bengali'),
            ('guj', 'guj_Gujr', 'Gujarati'),
            ('hin', 'hin_Deva', 'Hindi'),
            ('mar', 'mar_Deva', 'Marathi'),
            ('npi', 'npi_Deva', 'Nepali'),
            ('ory', 'ory_Orya', 'Oriya'),
            ('pan', 'pan_Guru', 'Punjabi'),
            ('snd', 'snd_Arab', 'Sindhi'),
            ('urd', 'urd_Arab', 'Urdu'),
        )),
        ('Indo-European-Other', (
            ('ckb', 'ckb_Arab', 'Kurdish'),
            ('cym', 'cym_Latn', 'Welsh'),
            ('ell', 'ell_Grek', 'Greek'),
            ('fas', 'pes_Arab', 'Persian'),
            ('gle', 'gle_Latn', 'Irish'),
            ('hye', 'hye_Armn', 'Armenian'),
            ('ita', 'ita_Latn', 'Italian'),
            ('lav', 'lvs_Latn', 'Latvian'),
            ('lit', 'lit_Latn', 'Lithuanian'),
            ('pus', 'pbt_Arab', 'Pashto'),
            ('tgk', 'tgk_Cyrl', 'Tajik'),
        )),
        ('Austronesian', (
            ('ceb', 'ceb_Latn', 'Cebuano'),
            ('ind', 'ind_Latn', 'Indonesian'),
            ('jav', 'jav_Latn', 'Javanese'),
            ('mri', 'mri_Latn', 'Maori'),
            ('msa', 'zsm_Latn', 'Malay'),
            ('tgl', 'tgl_Latn', 'Tagalog'),
        )),
        ('Atlantic-Congo', (
            ('ibo', 'ibo_Latn', 'Igbo'),
            ('kam', 'kam_Latn', 'Kamba'),
            ('kea', 'kea_Latn', 'Kabuverdianu'),
            ('lin', 'lin_Latn', 'Lingala'),
            ('lug', 'lug_Latn', 'Luganda'),
            ('nso', 'nso_Latn', 'Northern Sotho'),
            ('nya', 'nya_Latn', 'Nyanja'),
            ('sna', 'sna_Latn', 'Shona'),
            ('swh', 'swh_Latn', 'Swahili'),
            ('umb', 'umb_Latn', 'Umbundu'),
            ('wol', 'wol_Latn', 'Wolof'),
            ('xho', 'xho_Latn', 'Xhosa'),
            ('yor', 'yor_Latn', 'Yoruba'),
            ('zul', 'zul_Latn', 'Zulu'),
        )),
        ('Afro-Asiatic', (
            ('amh', 'amh_Ethi', 'Amharic'),
            ('ara', 'arb_Arab', 'Arabic'),
            ('ful', 'fuv_Latn', 'Fulah'),
            ('mlt', 'mlt_Latn', 'Maltese'),
            ('orm', 'gaz_Latn', 'Oromo'),
            ('som', 'som_Latn', 'Somali'),
        )),
        ('Turkic', (
            ('azj', 'azj_Latn', 'Azerbaijani'),
            ('kaz', 'kaz_Cyrl', 'Kazakh'),
            ('kir', 'kir_Cyrl', 'Kyrgyz'),
            ('tur', 'tur_Latn', 'Turkish'),
            ('uzb', 'uzn_Latn', 'Uzbek'),
        )),
        ('Dravidian', (
            ('kan', 'kan_Knda', 'Kannada'),
            ('mal', 'mal_Mlym', 'Malayalam'),
            ('tam', 'tam_Taml', 'Tamil'),
            ('tel', 'tel_Telu', 'Telugu'),
        )),
        ('Sino-Tibetan', (
            ('mya', 'mya_Mymr', 'Burmese'),
            ('zho_simpl', 'zho_Hans', 'Chinese (Simpl)'),
            ('zho_trad', 'zho_Hant', 'Chinese (Trad)'),
        )),
        ('Other', (
            ('est', 'est_Latn', 'Estonian'),
            ('fin', 'fin_Latn', 'Finnish'),
            ('hau', 'hau_Latn', 'Hausa'),
            ('heb', 'heb_Hebr', 'Hebrew'),
            ('hun', 'hun_Latn', 'Hungarian'),
            ('jpn', 'jpn_Jpan', 'Japanese'),
            ('kat', 'kat_Geor', 'Georgian'),
            ('khm', 'khm_Khmr', 'Khmer'),
            ('kor', 'kor_Hang', 'Korean'),
            ('lao', 'lao_Laoo', 'Lao'),
            ('luo', 'luo_Latn', 'Luo'),
            ('mon', 'khk_Cyrl', 'Mongolian'),
            ('tha', 'tha_Thai', 'Thai'),
            ('vie', 'vie_Latn', 'Vietnamese'),
        )),
    )
    for code, flores_code, language in members
]
|
|
|
|
# Index the language table by its first column (the short language code) so a
# whole row can be fetched by code.
flores_lang_map = {entry[0]: entry for entry in _flores_lang_map}

# Translation directions to evaluate: first every 'eng-<code>' pair, then
# every '<code>-eng' pair, each in table order. English itself is skipped so
# no 'eng-eng' entry is produced.
_flores_subtasks = [
    f'eng-{code}' if english_first else f'{code}-eng'
    for english_first in (True, False)
    for code in flores_lang_map
    if code != 'eng'
]
|
2023-07-04 21:34:55 +08:00
|
|
|
|
|
|
|
# One dataset config dict is appended per translation direction listed in
# ``_flores_subtasks``.
flores_datasets = []
for _flores_subtask in _flores_subtasks:
    # A subtask id has the form '<source code>-<target code>'.
    _src, _tgt = _flores_subtask.split('-')
    # Rows are [short code, Flores column code, display name, family]; only
    # the Flores column codes are used below. The display names are unpacked
    # but not referenced again in this loop.
    _, _flores_source, _src_inst, _ = flores_lang_map[_src]
    _, _flores_target, _tgt_inst, _ = flores_lang_map[_tgt]

    # Source sentence column is the input, target sentence column the
    # reference; 'dev' is named as the train split, 'devtest' as the test one.
    flores_reader_cfg = {
        'input_columns': f'sentence_{_flores_source}',
        'output_column': f'sentence_{_flores_target}',
        'train_split': 'dev',
        'test_split': 'devtest',
    }

    flores_infer_cfg = {
        'ice_template': {
            'type': PromptTemplate,
            # Only the Simplified-Chinese -> English direction gets explicit
            # "Chinese:" / "English:" labels; every other direction uses the
            # bare "<source> = <target>" form.
            'template': (
                f'</E>Chinese: {{sentence_{_flores_source}}}\nEnglish: {{sentence_{_flores_target}}}'
                if _flores_subtask == 'zho_simpl-eng'
                else f'</E>{{sentence_{_flores_source}}} = {{sentence_{_flores_target}}}'
            ),
            'ice_token': '</E>',
        },
        'retriever': {'type': TopkRetriever, 'ice_num': 8},
        'inferencer': {'type': GenInferencer},
    }

    # When the target is Simplified Chinese, both postprocessors are switched
    # to 'flores-chinese'; otherwise both are 'flores'. Each gets its own dict.
    # NOTE(review): the postprocessor names are presumably resolved through a
    # registry elsewhere in the project -- confirm both names are registered.
    flores_eval_cfg = {
        'evaluator': {'type': BleuEvaluator},
        'pred_role': 'BOT',
        'pred_postprocessor': {
            'type': 'flores-chinese' if _tgt == 'zho_simpl' else 'flores',
        },
        'dataset_postprocessor': {
            'type': 'flores-chinese' if _tgt == 'zho_simpl' else 'flores',
        },
    }

    # Shallow copies are stored, so the top-level dict held in the list is a
    # different object from the loop variable of the same iteration.
    flores_datasets.append({
        'abbr': f'flores_100_{_src}-{_tgt}',
        'type': FloresFirst100Dataset,
        'path': 'opencompass/flores',
        'name': f'{_flores_source}-{_flores_target}',
        'reader_cfg': flores_reader_cfg.copy(),
        'infer_cfg': flores_infer_cfg.copy(),
        'eval_cfg': flores_eval_cfg.copy(),
    })
|