mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
38 lines
1020 B
Python
38 lines
1020 B
Python
import re
|
|
|
|
from datasets import DatasetDict, load_dataset
|
|
|
|
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
|
|
|
|
from .base import BaseDataset
|
|
|
|
|
|
@LOAD_DATASET.register_module()
|
|
class FloresFirst100Dataset(BaseDataset):
|
|
|
|
@staticmethod
|
|
def load(name):
|
|
return DatasetDict({
|
|
'dev':
|
|
load_dataset(path='facebook/flores', name=name, split='dev'),
|
|
'devtest':
|
|
load_dataset(path='facebook/flores',
|
|
name=name,
|
|
split='devtest[:100]')
|
|
})
|
|
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('flores')
|
|
def flores_postprocess(text: str) -> str:
|
|
text = text.strip().split('\n')[0]
|
|
return text
|
|
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('flores-chinese')
|
|
def flores_postprocess_chinese(text: str) -> str:
|
|
import jieba
|
|
truncated_text = text.strip().split('\n')[0]
|
|
cleaned_text = re.sub(r'\s+', ' ', truncated_text).strip()
|
|
cleaned_text = ' '.join(jieba.cut(cleaned_text))
|
|
return cleaned_text
|