# OpenCompass/configs/datasets/flores/flores_gen_806ede.py
# (repository web-view export: 163 lines, 7.7 KiB, Python)

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import TopkRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import BleuEvaluator
from opencompass.datasets import FloresFirst100Dataset
# Language table for the FLORES subtasks defined in this file.
#
# Each row is a 4-item list:
#   [0] short language code; used as the lookup key and in the subtask /
#       abbreviation names (e.g. 'eng', 'zho_simpl')
#   [1] dataset-side code; used to build the 'sentence_<code>' column names
#       and the dataset `name` (e.g. 'eng_Latn')
#   [2] English language name interpolated into the translation prompt
#   [3] language-family label; not read anywhere in this file
#       (NOTE(review): presumably consumed by result grouping elsewhere)
_flores_lang_map = [
    ['eng', 'eng_Latn', 'English', 'Indo-European-Germanic'],
    ['afr', 'afr_Latn', 'Afrikaans', 'Indo-European-Germanic'],
    ['dan', 'dan_Latn', 'Danish', 'Indo-European-Germanic'],
    ['deu', 'deu_Latn', 'German', 'Indo-European-Germanic'],
    ['isl', 'isl_Latn', 'Icelandic', 'Indo-European-Germanic'],
    ['ltz', 'ltz_Latn', 'Luxembourgish', 'Indo-European-Germanic'],
    ['nld', 'nld_Latn', 'Dutch', 'Indo-European-Germanic'],
    ['nob', 'nob_Latn', 'Norwegian', 'Indo-European-Germanic'],
    ['swe', 'swe_Latn', 'Swedish', 'Indo-European-Germanic'],
    ['ast', 'ast_Latn', 'Asturian', 'Indo-European-Romance'],
    ['cat', 'cat_Latn', 'Catalan', 'Indo-European-Romance'],
    ['fra', 'fra_Latn', 'French', 'Indo-European-Romance'],
    ['glg', 'glg_Latn', 'Galician', 'Indo-European-Romance'],
    ['oci', 'oci_Latn', 'Occitan', 'Indo-European-Romance'],
    ['por', 'por_Latn', 'Portuguese', 'Indo-European-Romance'],
    ['ron', 'ron_Latn', 'Romanian', 'Indo-European-Romance'],
    ['spa', 'spa_Latn', 'Spanish', 'Indo-European-Romance'],
    ['bel', 'bel_Cyrl', 'Belarusian', 'Indo-European-Slavic'],
    ['bos', 'bos_Latn', 'Bosnian', 'Indo-European-Slavic'],
    ['bul', 'bul_Cyrl', 'Bulgarian', 'Indo-European-Slavic'],
    ['ces', 'ces_Latn', 'Czech', 'Indo-European-Slavic'],
    ['hrv', 'hrv_Latn', 'Croatian', 'Indo-European-Slavic'],
    ['mkd', 'mkd_Cyrl', 'Macedonian', 'Indo-European-Slavic'],
    ['pol', 'pol_Latn', 'Polish', 'Indo-European-Slavic'],
    ['rus', 'rus_Cyrl', 'Russian', 'Indo-European-Slavic'],
    ['slk', 'slk_Latn', 'Slovak', 'Indo-European-Slavic'],
    ['slv', 'slv_Latn', 'Slovenian', 'Indo-European-Slavic'],
    ['srp', 'srp_Cyrl', 'Serbian', 'Indo-European-Slavic'],
    ['ukr', 'ukr_Cyrl', 'Ukrainian', 'Indo-European-Slavic'],
    ['asm', 'asm_Beng', 'Assamese', 'Indo-European-Indo-Aryan'],
    ['ben', 'ben_Beng', 'Bengali', 'Indo-European-Indo-Aryan'],
    ['guj', 'guj_Gujr', 'Gujarati', 'Indo-European-Indo-Aryan'],
    ['hin', 'hin_Deva', 'Hindi', 'Indo-European-Indo-Aryan'],
    ['mar', 'mar_Deva', 'Marathi', 'Indo-European-Indo-Aryan'],
    ['npi', 'npi_Deva', 'Nepali', 'Indo-European-Indo-Aryan'],
    ['ory', 'ory_Orya', 'Oriya', 'Indo-European-Indo-Aryan'],
    ['pan', 'pan_Guru', 'Punjabi', 'Indo-European-Indo-Aryan'],
    ['snd', 'snd_Arab', 'Sindhi', 'Indo-European-Indo-Aryan'],
    ['urd', 'urd_Arab', 'Urdu', 'Indo-European-Indo-Aryan'],
    ['ckb', 'ckb_Arab', 'Kurdish', 'Indo-European-Other'],
    ['cym', 'cym_Latn', 'Welsh', 'Indo-European-Other'],
    ['ell', 'ell_Grek', 'Greek', 'Indo-European-Other'],
    ['fas', 'pes_Arab', 'Persian', 'Indo-European-Other'],
    ['gle', 'gle_Latn', 'Irish', 'Indo-European-Other'],
    ['hye', 'hye_Armn', 'Armenian', 'Indo-European-Other'],
    ['ita', 'ita_Latn', 'Italian', 'Indo-European-Other'],
    ['lav', 'lvs_Latn', 'Latvian', 'Indo-European-Other'],
    ['lit', 'lit_Latn', 'Lithuanian', 'Indo-European-Other'],
    ['pus', 'pbt_Arab', 'Pashto', 'Indo-European-Other'],
    ['tgk', 'tgk_Cyrl', 'Tajik', 'Indo-European-Other'],
    ['ceb', 'ceb_Latn', 'Cebuano', 'Austronesian'],
    ['ind', 'ind_Latn', 'Indonesian', 'Austronesian'],
    ['jav', 'jav_Latn', 'Javanese', 'Austronesian'],
    ['mri', 'mri_Latn', 'Maori', 'Austronesian'],
    ['msa', 'zsm_Latn', 'Malay', 'Austronesian'],
    ['tgl', 'tgl_Latn', 'Tagalog', 'Austronesian'],
    ['ibo', 'ibo_Latn', 'Igbo', 'Atlantic-Congo'],
    ['kam', 'kam_Latn', 'Kamba', 'Atlantic-Congo'],
    ['kea', 'kea_Latn', 'Kabuverdianu', 'Atlantic-Congo'],
    ['lin', 'lin_Latn', 'Lingala', 'Atlantic-Congo'],
    ['lug', 'lug_Latn', 'Luganda', 'Atlantic-Congo'],
    ['nso', 'nso_Latn', 'Northern Sotho', 'Atlantic-Congo'],
    ['nya', 'nya_Latn', 'Nyanja', 'Atlantic-Congo'],
    ['sna', 'sna_Latn', 'Shona', 'Atlantic-Congo'],
    ['swh', 'swh_Latn', 'Swahili', 'Atlantic-Congo'],
    ['umb', 'umb_Latn', 'Umbundu', 'Atlantic-Congo'],
    ['wol', 'wol_Latn', 'Wolof', 'Atlantic-Congo'],
    ['xho', 'xho_Latn', 'Xhosa', 'Atlantic-Congo'],
    ['yor', 'yor_Latn', 'Yoruba', 'Atlantic-Congo'],
    ['zul', 'zul_Latn', 'Zulu', 'Atlantic-Congo'],
    ['amh', 'amh_Ethi', 'Amharic', 'Afro-Asiatic'],
    ['ara', 'arb_Arab', 'Arabic', 'Afro-Asiatic'],
    ['ful', 'fuv_Latn', 'Fulah', 'Afro-Asiatic'],
    ['mlt', 'mlt_Latn', 'Maltese', 'Afro-Asiatic'],
    ['orm', 'gaz_Latn', 'Oromo', 'Afro-Asiatic'],
    ['som', 'som_Latn', 'Somali', 'Afro-Asiatic'],
    ['azj', 'azj_Latn', 'Azerbaijani', 'Turkic'],
    ['kaz', 'kaz_Cyrl', 'Kazakh', 'Turkic'],
    ['kir', 'kir_Cyrl', 'Kyrgyz', 'Turkic'],
    ['tur', 'tur_Latn', 'Turkish', 'Turkic'],
    ['uzb', 'uzn_Latn', 'Uzbek', 'Turkic'],
    ['kan', 'kan_Knda', 'Kannada', 'Dravidian'],
    ['mal', 'mal_Mlym', 'Malayalam', 'Dravidian'],
    ['tam', 'tam_Taml', 'Tamil', 'Dravidian'],
    ['tel', 'tel_Telu', 'Telugu', 'Dravidian'],
    ['mya', 'mya_Mymr', 'Burmese', 'Sino-Tibetan'],
    ['zho_simpl', 'zho_Hans', 'Chinese (Simpl)', 'Sino-Tibetan'],
    ['zho_trad', 'zho_Hant', 'Chinese (Trad)', 'Sino-Tibetan'],
    ['est', 'est_Latn', 'Estonian', 'Other'],
    ['fin', 'fin_Latn', 'Finnish', 'Other'],
    ['hau', 'hau_Latn', 'Hausa', 'Other'],
    ['heb', 'heb_Hebr', 'Hebrew', 'Other'],
    ['hun', 'hun_Latn', 'Hungarian', 'Other'],
    ['jpn', 'jpn_Jpan', 'Japanese', 'Other'],
    ['kat', 'kat_Geor', 'Georgian', 'Other'],
    ['khm', 'khm_Khmr', 'Khmer', 'Other'],
    ['kor', 'kor_Hang', 'Korean', 'Other'],
    ['lao', 'lao_Laoo', 'Lao', 'Other'],
    ['luo', 'luo_Latn', 'Luo', 'Other'],
    ['mon', 'khk_Cyrl', 'Mongolian', 'Other'],
    ['tha', 'tha_Thai', 'Thai', 'Other'],
    ['vie', 'vie_Latn', 'Vietnamese', 'Other'],
]
# Lookup from short language code (first item of each row) to its full row.
flores_lang_map = {row[0]: row for row in _flores_lang_map}

# Every subtask pairs English with one other language, named '<src>-<tgt>'.
# Order matters for the resulting dataset list: all 'eng-<x>' directions
# come first, followed by all '<x>-eng' directions, each in table order.
_flores_subtasks = [
    f'eng-{code}' for code in flores_lang_map if code != 'eng'
] + [
    f'{code}-eng' for code in flores_lang_map if code != 'eng'
]
# Build one dataset config dict per translation direction in
# `_flores_subtasks` and collect them in `flores_datasets`.
flores_datasets = []
for _flores_subtask in _flores_subtasks:
    # Subtask names have the form '<src>-<tgt>' (short language codes).
    _src, _tgt = _flores_subtask.split('-')
    # Per row we need the dataset-side code (column suffix) and the English
    # language name (prompt text); the short code and family are unused here.
    _, _flores_source, _src_inst, _ = flores_lang_map[_src]
    _, _flores_target, _tgt_inst, _ = flores_lang_map[_tgt]

    # Source sentence column is the input, target sentence column the
    # reference; 'dev' is the train split and 'devtest' the test split.
    flores_reader_cfg = dict(
        input_columns=f'sentence_{_flores_source}',
        output_column=f'sentence_{_flores_target}',
        train_split='dev',
        test_split='devtest'
    )

    # Two-turn template: HUMAN gives the instruction plus source sentence,
    # BOT holds the target sentence. The doubled braces in the f-strings
    # emit a literal '{sentence_<code>}' placeholder in the template text.
    # NOTE(review): '</E>' is both `begin` and `ice_token`, presumably the
    # slot where retrieved in-context examples are inserted -- confirm.
    flores_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=
                        f'Translate the following {_src_inst} statements to {_tgt_inst}.\n{{sentence_{_flores_source}}}'
                    ),
                    dict(role='BOT', prompt=f'{{sentence_{_flores_target}}}'),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=TopkRetriever, ice_num=8),
        inferencer=dict(type=GenInferencer),
    )

    flores_eval_cfg = dict(
        evaluator=dict(type=BleuEvaluator),
        pred_role='BOT',
    )
    # Only the Simplified Chinese target gets the 'flores' post-processors
    # on predictions and references.
    # NOTE(review): 'zho_trad' is not handled here -- confirm that is
    # intended.
    if _tgt == 'zho_simpl':
        flores_eval_cfg['pred_postprocessor'] = dict(type='flores')
        flores_eval_cfg['dataset_postprocessor'] = dict(type='flores')

    flores_datasets.append(
        dict(
            abbr=f'flores_100_{_src}-{_tgt}',
            type=FloresFirst100Dataset,
            path='opencompass/flores',
            name=f'{_flores_source}-{_flores_target}',
            # Shallow copies, so the appended entry does not share its
            # top-level cfg dicts with the loop variables.
            reader_cfg=flores_reader_cfg.copy(),
            infer_cfg=flores_infer_cfg.copy(),
            eval_cfg=flores_eval_cfg.copy(),
        ))