OpenCompass/opencompass/datasets/wikibench.py

import copy
import json

from datasets import Dataset

from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from .base import BaseDataset


def get_number(options):
    """Format ``options`` as lettered lines: ``A. ...``, ``B. ...``, etc.

    Each option is placed on its own line, prefixed with consecutive
    characters starting at ``'A'``, and every line (including the last one)
    ends with a newline. An empty ``options`` yields an empty string.
    """
    first_letter = ord('A')
    return ''.join(f'{chr(first_letter + offset)}. {choice}\n'
                   for offset, choice in enumerate(options))


@LOAD_DATASET.register_module()
class WikiBenchDataset(BaseDataset):
    """Loader for WikiBench data stored as one JSON object per line.

    The ``name`` given to :meth:`load` selects how each record becomes rows:

    * ``'cloze'`` in ``name``: keep only the stripped ``question`` and
      ``answer`` fields.
    * ``'circular'`` in ``name``: emit one row per rotation in
      ``circular_patterns``, with the options reordered and the answer
      letter remapped to its position after rotation.
    * otherwise: treat the record as a normal single-choice question and
      append the lettered options to the question text.
    """

    @staticmethod
    def load(path: str, name: str):
        """Read the file at ``path`` and build a ``Dataset`` from its rows.

        Args:
            path: Dataset location; it is resolved with ``get_data_path`` in
                local mode before the file is opened.
            name: Subset name. The substrings ``'cloze'`` and ``'circular'``
                select the row format (``'cloze'`` is checked first).

        Returns:
            The ``Dataset`` created by ``Dataset.from_list`` from all rows.

        Raises:
            KeyError: If a record lacks a field required by the selected
                format, or (circular mode) its answer is not a letter of
                the rotation pattern.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        path = get_data_path(path, local_mode=True)

        circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']

        data = []
        with open(path, 'r', encoding='utf-8') as infile:
            for line_idx, raw_line in enumerate(infile):
                # Blank lines (e.g. a trailing newline at the end of the
                # file) are skipped instead of crashing json.loads. They are
                # still counted by enumerate, so the ids embedded in circular
                # answers stay equal to the physical line index.
                if not raw_line.strip():
                    continue
                entry = json.loads(raw_line)
                if 'cloze' in name:
                    data.append({
                        'question': entry['question'].strip(),
                        'answer': entry['answer'].strip()
                    })
                elif 'circular' in name:
                    for pattern in circular_patterns:
                        rotated = copy.deepcopy(entry)
                        source_options = rotated['options']
                        # The k-th letter of the pattern names the original
                        # option that moves into position k.
                        rotated['options'] = [
                            source_options[ord(letter) - ord('A')]
                            for letter in pattern
                        ]
                        # Original answer letter -> its letter after rotation.
                        remap = {
                            letter: chr(ord('A') + position)
                            for position, letter in enumerate(pattern)
                        }
                        new_answer = remap[rotated['answer']]
                        # Answer is encoded as "<line index>--<letter>--
                        # <pattern>" so rotations of one record can be
                        # grouped together later.
                        rotated['answer'] = (str(line_idx) + '--' +
                                             new_answer + '--' + pattern)
                        rotated['question'] = (
                            rotated['question'].strip() + '\n' +
                            get_number(rotated['options']))
                        data.append(rotated)
                else:
                    # treat as normal single choice question
                    entry['question'] = (entry['question'].strip() + '\n' +
                                         get_number(entry['options']))
                    data.append(entry)

        dataset = Dataset.from_list(data)
        return dataset
[Feature] Add wikibench dataset (#655) * Add WikiBench * Add WikiBench * format --------- Co-authored-by: Leymore <zfz-960727@163.com> 2023-12-01 14:56:54 +08:00			`import copy`
			`import json`

			`from datasets import Dataset`

			`from opencompass.registry import LOAD_DATASET`
[Feature] Support ModelScope datasets (#1289) * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * udpate dataset for modelscope support * update readme * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * update readme * remove tydiqa japanese subset * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * update readme * udpate dataset for modelscope support * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * remove tydiqa japanese subset * update util * remove .DS_Store * fix md format * move util into package * update docs/get_started.md * restore eval_api_zhipu_v2.py, add environment setting * Update dataset * Update * Update * Update * Update --------- Co-authored-by: Yun lin <yunlin@U-Q9X2K4QV-1904.local> Co-authored-by: Yunnglin <mao.looper@qq.com> Co-authored-by: Yun lin <yunlin@laptop.local> Co-authored-by: Yunnglin <maoyl@smail.nju.edu.cn> Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn> 2024-07-29 13:48:32 +08:00			`from opencompass.utils import get_data_path`
[Feature] Add wikibench dataset (#655) * Add WikiBench * Add WikiBench * format --------- Co-authored-by: Leymore <zfz-960727@163.com> 2023-12-01 14:56:54 +08:00
			`from .base import BaseDataset`


			`def get_number(options):`

			`result_string = ''`
			`for i, option in enumerate(options, start=65):`
			`result_string += f'{chr(i)}. {option}\n'`
			`return result_string`


			`@LOAD_DATASET.register_module()`
			`class WikiBenchDataset(BaseDataset):`

			`@staticmethod`
			`def load(path: str, name: str):`
[Feature] Support ModelScope datasets (#1289) * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * udpate dataset for modelscope support * update readme * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * update readme * remove tydiqa japanese subset * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * update readme * udpate dataset for modelscope support * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * remove tydiqa japanese subset * update util * remove .DS_Store * fix md format * move util into package * update docs/get_started.md * restore eval_api_zhipu_v2.py, add environment setting * Update dataset * Update * Update * Update * Update --------- Co-authored-by: Yun lin <yunlin@U-Q9X2K4QV-1904.local> Co-authored-by: Yunnglin <mao.looper@qq.com> Co-authored-by: Yun lin <yunlin@laptop.local> Co-authored-by: Yunnglin <maoyl@smail.nju.edu.cn> Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn> 2024-07-29 13:48:32 +08:00			`path = get_data_path(path, local_mode=True)`
[Feature] Add wikibench dataset (#655) * Add WikiBench * Add WikiBench * format --------- Co-authored-by: Leymore <zfz-960727@163.com> 2023-12-01 14:56:54 +08:00
			`circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']`

			`data = []`
[Sync] minor test (#683) 2023-12-11 17:42:53 +08:00			`with open(path, 'r', encoding='utf-8') as infile:`
[Feature] Add wikibench dataset (#655) * Add WikiBench * Add WikiBench * format --------- Co-authored-by: Leymore <zfz-960727@163.com> 2023-12-01 14:56:54 +08:00			`for id, line in enumerate(infile):`
			`entry = json.loads(line)`
			`if 'cloze' in name:`
			`data.append({`
			`'question': entry['question'].strip(),`
			`'answer': entry['answer'].strip()`
			`})`
			`elif 'circular' in name:`
			`for c in circular_patterns:`
			`line = copy.deepcopy(entry)`
			`options = []`
			`for i in range(4):`
			`options.append(line['options'][ord(c[i]) -`
			`ord('A')])`
			`line['options'] = options`
			`line['answer'] = {`
			`c[0]: 'A',`
			`c[1]: 'B',`
			`c[2]: 'C',`
			`c[3]: 'D'`
			`}[line['answer']]`
			`line['answer'] = str(`
			`id) + '--' + line['answer'] + '--' + c`
			`line['question'] = line['question'].strip(`
			`) + '\n' + get_number(line['options'])`
			`data.append(line)`
			`else:`
			`# treat as normal single choice question`
			`entry['question'] = entry['question'].strip(`
			`) + '\n' + get_number(entry['options'])`
			`data.append(entry)`

			`dataset = Dataset.from_list(data)`
			`return dataset`