mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
34 lines
1.2 KiB
Python
34 lines
1.2 KiB
Python
![]() |
from datasets import concatenate_datasets, load_dataset
|
||
|
|
||
|
from opencompass.registry import LOAD_DATASET
|
||
|
|
||
|
from .base import BaseDataset
|
||
|
|
||
|
|
||
|
@LOAD_DATASET.register_module()
class XLSUMDataset(BaseDataset):
    """Dataset loader that merges the validation split of many languages.

    Presumably targets the multilingual XL-Sum summarization corpus (the
    class name and language list suggest so) -- confirm against the config
    that supplies ``path``.
    """

    @staticmethod
    def load(**kwargs):
        """Load and concatenate the ``validation`` split of every language.

        Args:
            **kwargs: Only the ``path`` key is read; it is forwarded as the
                first argument of ``load_dataset``. When the key is absent,
                ``None`` is forwarded instead. Any other keys are ignored.

        Returns:
            The result of ``concatenate_datasets`` applied to the
            per-language ``validation`` splits, in the order the languages
            are listed below.
        """
        dataset_path = kwargs.get('path')

        # The order of this sequence determines the row order of the merged
        # dataset, so it is kept exactly as originally written.
        languages = (
            'oromo', 'french', 'amharic', 'arabic', 'azerbaijani', 'bengali',
            'burmese', 'chinese_simplified', 'chinese_traditional', 'welsh',
            'english', 'kirundi', 'gujarati', 'hausa', 'hindi', 'igbo',
            'indonesian', 'japanese', 'korean', 'kyrgyz', 'marathi', 'spanish',
            'scottish_gaelic', 'nepali', 'pashto', 'persian', 'pidgin',
            'portuguese', 'punjabi', 'russian', 'serbian_cyrillic',
            'serbian_latin', 'sinhala', 'somali', 'swahili', 'tamil', 'telugu',
            'thai', 'tigrinya', 'turkish', 'ukrainian', 'urdu', 'uzbek',
            'vietnamese', 'yoruba',
        )

        # Each language is passed as the second positional argument of
        # ``load_dataset``; only its 'validation' split is retained.
        validation_splits = [
            load_dataset(dataset_path, language)['validation']
            for language in languages
        ]

        return concatenate_datasets(validation_splits)
|