# OpenCompass/opencompass/datasets/storycloze.py

import json
import os

from datasets import Dataset, DatasetDict

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class storyclozeDataset(BaseDataset):
    """StoryCloze loader that reads per-language JSONL files from disk.

    The ``train`` and ``eval`` files are concatenated and exposed as a
    single ``test`` split.
    """

    @staticmethod
    def load(path, lang):
        """Read ``{lang}_train.jsonl`` and ``{lang}_eval.jsonl`` from *path*.

        Every line is parsed as one JSON object. The four
        ``input_sentence_*`` fields are joined with single spaces and stored
        under a new ``context`` key; all other fields are kept as they are.

        Args:
            path: Directory that contains the two JSONL files.
            lang: Language prefix used in the file names.

        Returns:
            A ``DatasetDict`` whose only split, ``'test'``, holds the records
            of both files (``train`` first, then ``eval``).
        """
        sentence_keys = (
            'input_sentence_1',
            'input_sentence_2',
            'input_sentence_3',
            'input_sentence_4',
        )
        records = []
        for split_name in ('train', 'eval'):
            jsonl_file = os.path.join(path, f'{lang}_{split_name}.jsonl')
            with open(jsonl_file, 'r', encoding='utf-8') as handle:
                for record in map(json.loads, handle):
                    sentences = [record[key] for key in sentence_keys]
                    record['context'] = ' '.join(sentences)
                    records.append(record)
        return DatasetDict({'test': Dataset.from_list(records)})


@LOAD_DATASET.register_module()
class storyclozeDataset_V2(BaseDataset):
    """StoryCloze loader variant that exposes the answer as a letter.

    Reads the same per-language JSONL files as the non-V2 loader, but
    replaces ``answer_right_ending`` with a character and returns a plain
    ``Dataset`` instead of a ``DatasetDict``.
    """

    @staticmethod
    def load(path, lang):
        """Read ``{lang}_train.jsonl`` and ``{lang}_eval.jsonl`` from *path*.

        Every line is parsed as one JSON object. The four
        ``input_sentence_*`` fields are joined with single spaces and stored
        under a new ``context`` key, and ``answer_right_ending`` is replaced
        by the character at that index of ``' AB'``.

        Args:
            path: Directory that contains the two JSONL files.
            lang: Language prefix used in the file names.

        Returns:
            A ``Dataset`` holding the records of both files (``train``
            first, then ``eval``).
        """
        sentence_keys = (
            'input_sentence_1',
            'input_sentence_2',
            'input_sentence_3',
            'input_sentence_4',
        )
        records = []
        for split_name in ('train', 'eval'):
            jsonl_file = os.path.join(path, f'{lang}_{split_name}.jsonl')
            with open(jsonl_file, 'r', encoding='utf-8') as handle:
                for record in map(json.loads, handle):
                    sentences = [record[key] for key in sentence_keys]
                    record['context'] = ' '.join(sentences)
                    # The leading space in ' AB' makes index 1 -> 'A' and
                    # index 2 -> 'B'. Presumably the raw value is always the
                    # integer 1 or 2 -- confirm against the data files.
                    answer_index = record['answer_right_ending']
                    record['answer_right_ending'] = ' AB'[answer_index]
                    records.append(record)
        return Dataset.from_list(records)
[Feature] Use dataset in local path (#570) * update commonsenseqa * update drop * update flores_first100 * update gsm8k * update humaneval * update lambda * update obqa * update piqa * update race * update siqa * update story_cloze * update strategyqa * update tydiqa * update winogrande * update doc * update hellaswag * fix obqa * update collections * update .zip name 2023-11-13 13:00:37 +08:00			`import json`
			`import os`

			`from datasets import Dataset, DatasetDict`
initial commit 2023-07-04 21:34:55 +08:00
			`from opencompass.registry import LOAD_DATASET`

			`from .base import BaseDataset`


			`@LOAD_DATASET.register_module()`
			`class storyclozeDataset(BaseDataset):`

			`@staticmethod`
[Feature] Use dataset in local path (#570) * update commonsenseqa * update drop * update flores_first100 * update gsm8k * update humaneval * update lambda * update obqa * update piqa * update race * update siqa * update story_cloze * update strategyqa * update tydiqa * update winogrande * update doc * update hellaswag * fix obqa * update collections * update .zip name 2023-11-13 13:00:37 +08:00			`def load(path, lang):`
			`dataset_list = []`
			`for split in ['train', 'eval']:`
			`split_path = os.path.join(path, f'{lang}_{split}.jsonl')`
			`with open(split_path, 'r', encoding='utf-8') as f:`
			`for line in f:`
			`line = json.loads(line)`
			`line['context'] = ' '.join([`
			`line['input_sentence_1'], line['input_sentence_2'],`
			`line['input_sentence_3'], line['input_sentence_4']`
			`])`
			`dataset_list.append(line)`
			`dataset_list = Dataset.from_list(dataset_list)`
			`return DatasetDict({'test': dataset_list})`
initial commit 2023-07-04 21:34:55 +08:00

			`@LOAD_DATASET.register_module()`
			`class storyclozeDataset_V2(BaseDataset):`

			`@staticmethod`
[Feature] Use dataset in local path (#570) * update commonsenseqa * update drop * update flores_first100 * update gsm8k * update humaneval * update lambda * update obqa * update piqa * update race * update siqa * update story_cloze * update strategyqa * update tydiqa * update winogrande * update doc * update hellaswag * fix obqa * update collections * update .zip name 2023-11-13 13:00:37 +08:00			`def load(path, lang):`
			`dataset_list = []`
			`for split in ['train', 'eval']:`
			`split_path = os.path.join(path, f'{lang}_{split}.jsonl')`
			`with open(split_path, 'r', encoding='utf-8') as f:`
			`for line in f:`
			`line = json.loads(line)`
			`line['context'] = ' '.join([`
			`line['input_sentence_1'], line['input_sentence_2'],`
			`line['input_sentence_3'], line['input_sentence_4']`
			`])`
			`line['answer_right_ending'] = ' AB'[`
			`line['answer_right_ending']]`
			`dataset_list.append(line)`
			`dataset_list = Dataset.from_list(dataset_list)`
			`return dataset_list`