# flake8: noqa
import ast
import json
import os
from os import environ
import pandas as pd
import tiktoken
from tqdm import tqdm

from .constructions import ChatGPTSchema, ResultsForHumanSchema
from .utils import extract_answer, read_jsonl, save_jsonl

# define the datasets
english_qa_datasets = [
    'lsat-ar', 'lsat-lr', 'lsat-rc', 'logiqa-en', 'sat-math', 'sat-en',
    'aqua-rat', 'sat-en-without-passage', 'gaokao-english'
]
chinese_qa_datasets = [
    'logiqa-zh', 'jec-qa-kd', 'jec-qa-ca', 'gaokao-chinese',
    'gaokao-geography', 'gaokao-history', 'gaokao-biology', 'gaokao-chemistry',
    'gaokao-physics', 'gaokao-mathqa'
]
english_cloze_datasets = ['math']
chinese_cloze_datasets = ['gaokao-mathcloze']

multi_choice_datasets = ['jec-qa-kd', 'jec-qa-ca', 'gaokao-physics']
math_output_datasets = ['gaokao-mathcloze', 'math']


def convert_zero_shot(line, dataset_name):
    try:
        passage = line['passage'] if line['passage'] is not None else ''
        if dataset_name in english_qa_datasets:
            option_string = 'ABCDEFG'
            count = len(line['options'])
            if count == 1:
                count = 5
            return passage + 'Q: ' + line['question'] + ' ' \
                + 'Answer Choices: ' + ' '.join(line['options']) + '\n' + \
                'A: Among A through {}, the answer is'.format(
                    option_string[count - 1])
        elif dataset_name in chinese_qa_datasets:
            option_string = 'ABCDEFG'
            count = len(line['options'])
            if count == 1:
                count = 4
            return passage + '问题: ' + line['question'] + ' ' \
                + '选项: ' + ' '.join(line['options']) + '\n' + \
                '答案: 从A到{}, 我们应选择'.format(option_string[count - 1])
        elif dataset_name in english_cloze_datasets:
            return passage + 'Q: ' + line['question'] + '\n' \
                'A: The answer is'
        elif dataset_name in chinese_cloze_datasets:
            return passage + '问题: ' + line['question'] + '\n' \
                '答案: '
    except NameError:
        print('Dataset not defined.')
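
# Illustrative only: a hypothetical AGIEval-style record and the zero-shot
# prompt it would produce for an English QA dataset.
#   line = {'passage': None, 'question': 'What is 1 + 1?',
#           'options': ['(A) 1', '(B) 2'], 'label': 'B'}
#   convert_zero_shot(line, 'sat-math')
#   # -> 'Q: What is 1 + 1? Answer Choices: (A) 1 (B) 2\n'
#   #    'A: Among A through B, the answer is'
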
prefix = '该问题为单选题,所有选项中必有一个正确答案,且只有一个正确答案。\n'


def convert_zero_shot_CoT_stage1(line, dataset_name):
    try:
        passage = line['passage'] if line['passage'] is not None else ''
        if dataset_name in english_qa_datasets:
            return passage + 'Q: ' + line['question'] + ' ' \
                + 'Answer Choices: ' + ' '.join(line['options']) + '\n' + \
                "Let's think step by step."
        elif dataset_name in chinese_qa_datasets:
            option_string = 'ABCDEFG'
            count = len(line['options'])
            if count == 1:
                count = 4
            return passage + '问题: ' + line['question'] + ' ' \
                + '选项: ' + ' '.join(line['options']) + '\n' + \
                '从A到{}, 我们应选择什么?让我们逐步思考: '.format(
                    option_string[count - 1])
        elif dataset_name in english_cloze_datasets:
            return passage + 'Q: ' + line['question'] + '\n' \
                "A: Let's think step by step."
        elif dataset_name in chinese_cloze_datasets:
            return passage + '问题: ' + line['question'] + '\n' \
                '答案:让我们逐步思考: '
    except NameError:
        print('Dataset not defined.')
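
# Stage 1 of the zero-shot-CoT flow: the same hypothetical record gets an
# open-ended "think step by step" prompt; the model's reasoning is later fed
# to generate_second_stage_input() below to extract the final answer.
#   convert_zero_shot_CoT_stage1(line, 'sat-math')
#   # -> 'Q: What is 1 + 1? Answer Choices: (A) 1 (B) 2\n'
#   #    "Let's think step by step."
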
# process few-shot raw_prompts
def combine_prompt(prompt_path,
                   dataset_name,
                   load_explanation=True,
                   chat_mode=False):
    skip_passage = False
    if dataset_name == 'sat-en-without-passage':
        skip_passage = True
        dataset_name = 'sat-en'
    demonstrations = []
    # read the prompts by context and explanation
    context_row = [0, 1, 3, 5, 7, 9]
    explanation_row = [0, 2, 4, 6, 8, 10]
    raw_prompts_context = pd.read_csv(prompt_path,
                                      header=0,
                                      skiprows=lambda x: x not in context_row,
                                      keep_default_na=False)
    raw_prompts_explanation = pd.read_csv(
        prompt_path,
        header=0,
        skiprows=lambda x: x not in explanation_row,
        keep_default_na=False).replace(r'\n\n', '\n', regex=True)
    contexts = []
    for line in list(raw_prompts_context[dataset_name]):
        if line:
            contexts.append(ast.literal_eval(line))
    explanations = [
        exp for exp in raw_prompts_explanation[dataset_name] if exp
    ]

    for idx, (con, exp) in enumerate(zip(contexts, explanations)):
        passage = con['passage'] if con[
            'passage'] is not None and not skip_passage else ''
        question = con['question']
        options = con['options'] if con['options'] is not None else ''
        label = con['label'] if con['label'] is not None else ''
        answer = con[
            'answer'] if 'answer' in con and con['answer'] is not None else ''

        if dataset_name in english_qa_datasets:
            question_input = 'Problem {}. '.format(idx + 1) + passage + ' ' + question + '\n' \
                + 'Choose from the following options: ' + ' '.join(options) + '\n'
            question_output = (('Explanation for Problem {}: '.format(idx + 1) + exp + '\n') if load_explanation else '') \
                + 'The answer is therefore {}'.format(label)
        elif dataset_name in chinese_qa_datasets:
            question_input = '问题 {}. '.format(idx + 1) + passage + ' ' + question + '\n' \
                + '从以下选项中选择: ' + ' '.join(options) + '\n'
            question_output = (('问题 {}的解析: '.format(idx + 1) + exp + '\n') if load_explanation else '') \
                + '答案是 {}'.format(label)
        elif dataset_name in english_cloze_datasets:
            question_input = 'Problem {}. '.format(idx + 1) + question + '\n'
            question_output = (('Explanation for Problem {}: '.format(idx + 1) + exp + '\n') if load_explanation else '') \
                + 'The answer is therefore {}'.format(answer)
        elif dataset_name in chinese_cloze_datasets:
            question_input = '问题 {}. '.format(idx + 1) + question + '\n'
            question_output = (('问题 {}的解析: '.format(idx + 1) + exp + '\n') if load_explanation else '') \
                + '答案是 {}'.format(answer)
        else:
            raise ValueError(
                f'During loading few-shot examples, found unknown dataset: {dataset_name}'
            )
        if chat_mode:
            demonstrations.append((question_input, question_output))
        else:
            demonstrations.append(question_input + question_output + '\n')
    return demonstrations
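
# A sketch of the few_shot_prompts.csv layout this function assumes (inferred
# from the hard-coded row indices above, not documented in the file): row 0 is
# a header of dataset-name columns, and the following rows alternate between a
# dict-literal context and its free-text explanation, five shots per dataset.
#   row 0: lsat-ar, lsat-lr, ...
#   row 1: "{'passage': ..., 'question': ..., 'options': ..., 'label': ...}"
#   row 2: explanation for shot 1
#   row 3: "{...}"  (shot 2 context, and so on)
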
enc = None


def _lazy_load_enc():
    global enc
    if enc is None:
        enc = tiktoken.encoding_for_model('gpt-4')


# truncate the few-shot prompt when it exceeds the max token length
def concat_prompt(demos,
                  dataset_name,
                  max_tokens,
                  end_of_example='\n',
                  verbose=False):
    _lazy_load_enc()
    demonstration_en = 'Here are the answers for the problems in the exam.\n'
    demonstration_zh = '以下是考试中各个问题的答案。\n'
    # fall back to the bare instruction if even the first demo overflows,
    # so `output` and `prompt_num` are always defined at the return
    output = demonstration_en if dataset_name in english_qa_datasets \
        or dataset_name in english_cloze_datasets else demonstration_zh
    prompt_num = 0

    for i in range(len(demos)):
        if dataset_name in english_qa_datasets:
            demonstration_en = demonstration_en + demos[i] + end_of_example
        elif dataset_name in chinese_qa_datasets:
            demonstration_zh = demonstration_zh + demos[i] + end_of_example
        elif dataset_name in english_cloze_datasets:
            demonstration_en = demonstration_en + demos[i] + end_of_example
        elif dataset_name in chinese_cloze_datasets:
            demonstration_zh = demonstration_zh + demos[i] + end_of_example
        # break if reaching the max token limit
        if len(enc.encode(demonstration_en)) < max_tokens and len(
                enc.encode(demonstration_zh)) < max_tokens:
            output = demonstration_en if len(demonstration_en) > len(
                demonstration_zh) else demonstration_zh
            prompt_num = i + 1
        else:
            break
    if verbose:
        print('max_tokens set as', max_tokens, 'actual_tokens is',
              len(enc.encode(output)), 'num_shot is', prompt_num)
    return output, prompt_num
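
# Usage sketch (values are hypothetical): `demos` comes from combine_prompt();
# the function returns the concatenated few-shot prompt and how many shots fit
# under the token budget.
#   prompt, n_shot = concat_prompt(demos, 'lsat-ar', max_tokens=2048)
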

def concat_prompt_chat_mode(demos,
                            dataset_name,
                            max_tokens,
                            end_of_example='\n',
                            verbose=False):
    _lazy_load_enc()
    answers = []
    sentences = ''
    for i in range(len(demos)):
        answers += [
            {
                'role': 'user',
                'content': demos[i][0]
            },
            {
                'role': 'assistant',
                'content': demos[i][1]
            },
        ]
        sentences += json.dumps(answers[-1])
        # break if reaching the max token limit
        if len(enc.encode(sentences)) > max_tokens:
            answers.pop()
            answers.pop()
            break
    if verbose:
        print('max_tokens set as', max_tokens, 'actual_tokens is',
              len(enc.encode(sentences)), 'num_shot is',
              len(answers) // 2)
    return answers, len(answers) // 2
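
# In chat mode the shots become a flat OpenAI-style message list instead of
# one string, two entries per shot (a sketch with hypothetical inputs):
#   msgs, n_shot = concat_prompt_chat_mode(demos, 'lsat-ar', max_tokens=2048)
#   # msgs == [{'role': 'user', 'content': ...},
#   #          {'role': 'assistant', 'content': ...}, ...]
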

def convert_few_shot(line, dataset_name, demo, n_shot, chat_mode=False):
    passage = line['passage'] if line['passage'] is not None else ''
    question = line['question']
    options = line['options'] if line['options'] is not None else ''

    if dataset_name in english_qa_datasets:
        question_input = 'Problem {}. '.format(n_shot + 1) + passage + ' ' + question + '\n' \
            + 'Choose from the following options: ' + ' '.join(options) + '\n'
    if dataset_name in chinese_qa_datasets:
        question_input = '问题 {}. '.format(n_shot + 1) + passage + ' ' + question + '\n' \
            + '从以下选项中选择: ' + ' '.join(options) + '\n'
    if dataset_name in english_cloze_datasets:
        question_input = 'Problem {}. '.format(n_shot + 1) + question + '\n'
    if dataset_name in chinese_cloze_datasets:
        question_input = '问题 {}. '.format(n_shot + 1) + question + '\n'

    if chat_mode:
        return demo + [
            {
                'role': 'user',
                'content': question_input
            },
        ]
    else:
        return demo + question_input
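
# Sketch: the evaluation question is appended after the chosen demos as
# problem number n_shot + 1, so it reads as the next item in the same exam.
# `chosen_prompt` and `n_shot` would come from concat_prompt() above.
#   ctxt = convert_few_shot(line, 'sat-math', chosen_prompt, n_shot)
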

def load_dataset(dataset_name,
                 setting_name,
                 parent_path,
                 prompt_path=None,
                 max_tokens=None,
                 end_of_example='\n',
                 chat_mode=False,
                 verbose=False):
    if environ.get('DATASET_SOURCE') == 'ModelScope':
        from modelscope import MsDataset
        loaded_jsonl = MsDataset.load(parent_path,
                                      subset_name=dataset_name,
                                      split='test')
    else:
        test_path = os.path.join(parent_path, dataset_name + '.jsonl')
        loaded_jsonl = read_jsonl(test_path)
    processed = []
    if setting_name == 'few-shot-CoT' or setting_name == 'few-shot':
        # process the demos once if it is few-shot or few-shot-CoT
        processed_demos = combine_prompt(
            prompt_path,
            dataset_name,
            load_explanation=setting_name == 'few-shot-CoT',
            chat_mode=chat_mode)
        if chat_mode:
            chosen_prompt, n_shot = concat_prompt_chat_mode(processed_demos,
                                                            dataset_name,
                                                            max_tokens,
                                                            end_of_example,
                                                            verbose=verbose)
        else:
            chosen_prompt, n_shot = concat_prompt(processed_demos,
                                                  dataset_name,
                                                  max_tokens,
                                                  end_of_example,
                                                  verbose=verbose)

    if verbose:
        loaded_jsonl = tqdm(loaded_jsonl)
    for meta_idx, line in enumerate(loaded_jsonl):
        if setting_name == 'zero-shot':
            ctxt = convert_zero_shot(line, dataset_name)
        elif setting_name == 'zero-shot-CoT':
            ctxt = convert_zero_shot_CoT_stage1(line, dataset_name)
        elif setting_name == 'few-shot-CoT' or setting_name == 'few-shot':
            ctxt = convert_few_shot(line, dataset_name, chosen_prompt, n_shot,
                                    chat_mode)
        try:
            new_instance = ChatGPTSchema(context=ctxt, metadata=meta_idx)
            processed.append(new_instance.to_dict())
        except NameError:
            print('Dataset not defined.')
    return processed
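
# Illustrative call (paths are hypothetical). With DATASET_SOURCE=ModelScope
# set in the environment, `parent_path` is treated as a ModelScope dataset id;
# otherwise it must be a local directory containing '<dataset_name>.jsonl'.
#   items = load_dataset('logiqa-en', 'zero-shot', './data/v1_1')
#   # each item is a ChatGPTSchema dict holding the prompt and its row index
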

def generate_second_stage_input(dataset_name,
                                input_list,
                                output_list,
                                with_format_prompt=False):
    try:
        english_format_prompt = 'Based on the previous results, your task is to extract the final answer and provide the output enclosed in brackets【】, such as 【0】 or 【A】.'
        chinese_format_prompt = '根据以上内容, 你的任务是把最终的答案提取出来并填在【】中, 例如【0】或者【A】。'
        if dataset_name in english_qa_datasets:
            prompt_suffix = 'Therefore, among A through E, the answer is'
            if with_format_prompt:
                prompt_suffix = english_format_prompt + prompt_suffix
        elif dataset_name in chinese_qa_datasets:
            prompt_suffix = '因此, 从A到D, 我们应选择'
            if with_format_prompt:
                prompt_suffix = chinese_format_prompt + prompt_suffix
        elif dataset_name in english_cloze_datasets:
            prompt_suffix = 'Therefore, the answer is'
            if with_format_prompt:
                prompt_suffix = english_format_prompt + prompt_suffix
        elif dataset_name in chinese_cloze_datasets:
            prompt_suffix = '因此,答案是'
            if with_format_prompt:
                prompt_suffix = chinese_format_prompt + prompt_suffix
    except NameError:
        print('Dataset not defined.')
    processed = []
    for i in range(len(input_list)):
        ctxt = '{0}\n{1}\n{2}'.format(input_list[i]['context'],
                                      extract_answer(output_list[i]),
                                      prompt_suffix)
        new_instance = ChatGPTSchema(context=ctxt,
                                     metadata=input_list[i]['metadata'])
        processed.append(new_instance.to_dict())
    return processed
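
# Stage 2 of the zero-shot-CoT flow (a sketch; `stage1_inputs` and
# `stage1_outputs` are hypothetical names): each stage-1 prompt, the answer
# extracted from the model's reasoning, and an answer-extraction suffix are
# stitched into a follow-up request.
#   stage2 = generate_second_stage_input('logiqa-en', stage1_inputs,
#                                        stage1_outputs)
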

def load_dataset_as_result_schema(dataset_name, parent_path):
    if environ.get('DATASET_SOURCE') == 'ModelScope':
        from modelscope import MsDataset
        loaded_jsonl = MsDataset.load(parent_path,
                                      subset_name=dataset_name,
                                      split='test')
    else:
        test_path = os.path.join(parent_path, dataset_name + '.jsonl')
        loaded_jsonl = read_jsonl(test_path)
    processed = []
    for i, line in enumerate(loaded_jsonl):
        problem_input = convert_zero_shot(line, dataset_name)
        processed.append(
            ResultsForHumanSchema(
                index=i,
                problem_input=problem_input,
                label=line['label'] if line['label'] else line['answer'],
            ))
    return processed


if __name__ == '__main__':
    # set variables
    parent_dir = '../../data/V1_1/'
    raw_prompt_path = '../data/few_shot_prompts.csv'

    # set dataset name to process
    setting_name = 'few-shot-CoT'  # setting_name can be chosen from ["zero-shot", "zero-shot-CoT", "few-shot-CoT"]
    data_name = 'jec-qa-kd'
    save_dir = '../../experiment_input/{}/'.format(setting_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    processed_data = load_dataset(data_name,
                                  setting_name,
                                  parent_dir,
                                  prompt_path=raw_prompt_path,
                                  max_tokens=2048)
    save_jsonl(processed_data,
               os.path.join(save_dir, '{}.jsonl'.format(data_name)))