diff --git a/configs/datasets/MathBench/mathbench_2024_gen_50a320.py b/configs/datasets/MathBench/mathbench_2024_gen_50a320.py
new file mode 100644
index 00000000..2d20f450
--- /dev/null
+++ b/configs/datasets/MathBench/mathbench_2024_gen_50a320.py
@@ -0,0 +1,81 @@
+from mmengine.config import read_base
+from copy import deepcopy
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
+from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+from opencompass.datasets import MathBenchDataset, math_postprocess_v2
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+with read_base():
+    from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
+
+# Number of shots; the maximum for this dataset is 4
+num_shot = 0
+# Whether to generate a reasoning path (single-choice questions only)
+with_reasoning = True
+# Whether to use circular evaluation
+with_circular_eval = True
+# Whether to use PPL mode for the single-choice tests
+use_ppl_single_choice = False
+
+assert 0 <= num_shot <= 4
+if num_shot == 0:
+    prompts = zero_shot_prompts
+else:
+    prompts = {name: p[-2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
+
+mathbench_datasets = []
+for _split in mathbench_sets:
+    for _name in mathbench_sets[_split]:
+        if 'single_choice' in _name:
+            if with_reasoning:
+                template_round = prompts[_name + '_with_reasoning']
+            else:
+                template_round = prompts[_name]
+        else:
+            template_round = prompts[_name]
+
+        if 'single_choice' in _name:
+            pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
+        else:
+            pred_postprocessor = dict(type=math_postprocess_v2)
+
+        if 'single_choice' in _name and with_circular_eval:
+            evaluator = dict(type=CircularEvaluator)
+        else:
+            evaluator = dict(type=AccEvaluator)
+
+        # assemble the final config
+        mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
+        if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
+            template = {}
+            for answer in ['A', 'B', 'C', 'D']:
+                one_template_round = deepcopy(template_round)
+                one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
+                template[answer] = dict(round=one_template_round)
+            mathbench_infer_cfg = dict(
+                prompt_template=dict(type=PromptTemplate, template=template),
+                retriever=dict(type=ZeroRetriever),
+                inferencer=dict(type=PPLInferencer),
+            )
+        else:
+            mathbench_infer_cfg = dict(
+                prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
+                retriever=dict(type=ZeroRetriever),
+                inferencer=dict(type=GenInferencer, max_out_len=2048),
+            )
+        mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
+
+        mathbench_datasets.append(
+            dict(
+                abbr='mathbench-' + _split + '-' + _name,
+                type=MathBenchDataset,
+                path=f'data/mathbench_v1/{_split}',
+                name=_name,
+                with_circular=with_circular_eval,
+                reader_cfg=mathbench_reader_cfg,
+                infer_cfg=mathbench_infer_cfg,
+                eval_cfg=mathbench_eval_cfg,
+            )
+        )
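Note for reviewers: when `use_ppl_single_choice` is enabled, the branch above turns the single generative chat round into four likelihood-scored candidates, one per option letter. The standalone sketch below illustrates that transformation; the prompt text and the list-shaped `template_round` are illustrative stand-ins, not content from `mathbench_prompt.py`:

```python
from copy import deepcopy

# Illustrative single-choice round whose last turn carries an `{answer}` slot.
template_round = [
    dict(role='HUMAN', prompt='Question: 1 + 1 = ?\nA. 1\nB. 2\nC. 3\nD. 4'),
    dict(role='BOT', prompt='The answer is {answer}'),
]

# Build one completed template per option letter; a PPL inferencer then
# scores the likelihood of each filled-in continuation and predicts the
# option whose completion is most probable.
template = {}
for answer in 'ABCD':
    one_round = deepcopy(template_round)
    one_round[-1]['prompt'] = one_round[-1]['prompt'].format(answer=answer)
    template[answer] = dict(round=one_round)

assert template['B']['round'][-1]['prompt'] == 'The answer is B'
```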
diff --git a/configs/datasets/MathBench/mathbench_prompt.py b/configs/datasets/MathBench/mathbench_prompt.py
index 069528ee..8052dab6 100644
--- a/configs/datasets/MathBench/mathbench_prompt.py
+++ b/configs/datasets/MathBench/mathbench_prompt.py
@@ -11,6 +11,12 @@ zero_shot_prompts = {
     'single_choice_en': [
         dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics.\nPlease provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
     ],
+    'cloze_en': [
+        dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
+    ],
+    'cloze_cn': [
+        dict(role='HUMAN', prompt='{question}\n请一步一步推理,并在最后用\\boxed{}给出你的答案。'),
+    ]
 }
 
 few_shot_prompts = {
diff --git a/configs/datasets/wikibench/wikibench_gen_0978ad.py b/configs/datasets/wikibench/wikibench_gen_0978ad.py
new file mode 100644
index 00000000..871133f9
--- /dev/null
+++ b/configs/datasets/wikibench/wikibench_gen_0978ad.py
@@ -0,0 +1,56 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+from opencompass.datasets import WikiBenchDataset
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+
+# Prompt (zh): a single-choice question; reason step by step and finish with
+# '所以答案为选项X' ('so the answer is option X'), X being one of A/B/C/D.
+single_choice_prompts = {
+    'single_choice_cn': '以下是一道单项选择题,请你根据你了解的知识一步步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。\n下面是你要回答的题目:\n{question}\n让我们一步步推理:',
+}
+
+wikibench_sets = {
+    'wiki': ['single_choice_cn'],
+}
+
+do_circular = True
+
+wikibench_datasets = []
+
+for _split in list(wikibench_sets.keys()):
+    for _name in wikibench_sets[_split]:
+        wikibench_infer_cfg = dict(
+            ice_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin='</E>',
+                    round=[
+                        dict(role='HUMAN', prompt=single_choice_prompts[_name]),
+                        dict(role='BOT', prompt='{answer}'),
+                    ],
+                ),
+                ice_token='</E>',
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer),
+        )
+        wikibench_eval_cfg = dict(
+            evaluator=dict(type=CircularEvaluator if do_circular else AccEvaluator),
+            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
+        )
+
+        wikibench_datasets.append(
+            dict(
+                type=WikiBenchDataset,
+                path=f'./data/WikiBench/{_name}.jsonl',
+                name=('circular_' + _name) if do_circular else _name,
+                abbr='wikibench-' + _split + '-' + _name + ('circular' if do_circular else ''),
+                reader_cfg=dict(
+                    input_columns=['question'],
+                    output_column='answer',
+                ),
+                infer_cfg=wikibench_infer_cfg,
+                eval_cfg=wikibench_eval_cfg,
+            )
+        )
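On the `do_circular` switch above (and `with_circular_eval` in the MathBench config): with circular evaluation the dataset repeats each question once per rotation of the option order, and the evaluator only credits a question that is answered correctly under every rotation. A toy illustration of that stricter accuracy follows; it is not `CircularEvaluator`'s actual implementation:

```python
from collections import defaultdict

# (question_id, option_rotation, was_the_model_correct)
results = [
    ('q1', 'ABCD', True), ('q1', 'BCDA', True), ('q1', 'CDAB', True), ('q1', 'DABC', True),
    ('q2', 'ABCD', True), ('q2', 'BCDA', False), ('q2', 'CDAB', True), ('q2', 'DABC', True),
]

per_question = defaultdict(list)
for qid, _rotation, ok in results:
    per_question[qid].append(ok)

# A question only counts if all four rotations are answered correctly,
# which penalizes option-position bias.
circular_acc = sum(all(oks) for oks in per_question.values()) / len(per_question)
print(circular_acc)  # 0.5 -- q2 fails because one rotation was wrong
```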
diff --git a/opencompass/configs/datasets/MathBench/mathbench_2024_gen_50a320.py b/opencompass/configs/datasets/MathBench/mathbench_2024_gen_50a320.py
new file mode 100644
index 00000000..2d20f450
--- /dev/null
+++ b/opencompass/configs/datasets/MathBench/mathbench_2024_gen_50a320.py
@@ -0,0 +1,81 @@
+from mmengine.config import read_base
+from copy import deepcopy
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
+from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+from opencompass.datasets import MathBenchDataset, math_postprocess_v2
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+with read_base():
+    from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
+
+# Number of shots; the maximum for this dataset is 4
+num_shot = 0
+# Whether to generate a reasoning path (single-choice questions only)
+with_reasoning = True
+# Whether to use circular evaluation
+with_circular_eval = True
+# Whether to use PPL mode for the single-choice tests
+use_ppl_single_choice = False
+
+assert 0 <= num_shot <= 4
+if num_shot == 0:
+    prompts = zero_shot_prompts
+else:
+    prompts = {name: p[-2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
+
+mathbench_datasets = []
+for _split in mathbench_sets:
+    for _name in mathbench_sets[_split]:
+        if 'single_choice' in _name:
+            if with_reasoning:
+                template_round = prompts[_name + '_with_reasoning']
+            else:
+                template_round = prompts[_name]
+        else:
+            template_round = prompts[_name]
+
+        if 'single_choice' in _name:
+            pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
+        else:
+            pred_postprocessor = dict(type=math_postprocess_v2)
+
+        if 'single_choice' in _name and with_circular_eval:
+            evaluator = dict(type=CircularEvaluator)
+        else:
+            evaluator = dict(type=AccEvaluator)
+
+        # assemble the final config
+        mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
+        if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
+            template = {}
+            for answer in ['A', 'B', 'C', 'D']:
+                one_template_round = deepcopy(template_round)
+                one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
+                template[answer] = dict(round=one_template_round)
+            mathbench_infer_cfg = dict(
+                prompt_template=dict(type=PromptTemplate, template=template),
+                retriever=dict(type=ZeroRetriever),
+                inferencer=dict(type=PPLInferencer),
+            )
+        else:
+            mathbench_infer_cfg = dict(
+                prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
+                retriever=dict(type=ZeroRetriever),
+                inferencer=dict(type=GenInferencer, max_out_len=2048),
+            )
+        mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
+
+        mathbench_datasets.append(
+            dict(
+                abbr='mathbench-' + _split + '-' + _name,
+                type=MathBenchDataset,
+                path=f'data/mathbench_v1/{_split}',
+                name=_name,
+                with_circular=with_circular_eval,
+                reader_cfg=mathbench_reader_cfg,
+                infer_cfg=mathbench_infer_cfg,
+                eval_cfg=mathbench_eval_cfg,
+            )
+        )
diff --git a/opencompass/configs/datasets/MathBench/mathbench_prompt.py b/opencompass/configs/datasets/MathBench/mathbench_prompt.py
index 069528ee..8052dab6 100644
--- a/opencompass/configs/datasets/MathBench/mathbench_prompt.py
+++ b/opencompass/configs/datasets/MathBench/mathbench_prompt.py
@@ -11,6 +11,12 @@ zero_shot_prompts = {
     'single_choice_en': [
         dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics.\nPlease provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
     ],
+    'cloze_en': [
+        dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
+    ],
+    'cloze_cn': [
+        dict(role='HUMAN', prompt='{question}\n请一步一步推理,并在最后用\\boxed{}给出你的答案。'),
+    ]
 }
 
 few_shot_prompts = {
diff --git a/opencompass/configs/datasets/gpqa/gpqa_few_shot_ppl_2c9cd6.py b/opencompass/configs/datasets/gpqa/gpqa_few_shot_ppl_2c9cd6.py
new file mode 100644
index 00000000..3c06d7a4
--- /dev/null
+++ b/opencompass/configs/datasets/gpqa/gpqa_few_shot_ppl_2c9cd6.py
@@ -0,0 +1,49 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
+from opencompass.datasets import GPQADataset, GPQAEvaluator
+from opencompass.utils import first_option_postprocess
+
+gpqa_reader_cfg = dict(
+    input_columns=['question', 'A', 'B', 'C', 'D'],
+    output_column='answer')
+
+# Hint (zh): for the single-choice question below, give the correct option directly.
+hint = '对下面的单项选择题,请直接给出正确答案的选项。'
+question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'
+gpqa_infer_cfg = dict(
+    ice_template=dict(
+        type=PromptTemplate,
+        template={
+            opt: f'{question_and_options}\nAnswer: {opt}'
+            for opt in ['A', 'B', 'C', 'D']
+        },
+    ),
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            opt: f'{hint}\n</E>{question_and_options}\nAnswer: {opt}'
+            for opt in ['A', 'B', 'C', 'D']
+        },
+        ice_token='</E>'
+    ),
+    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+    inferencer=dict(type=PPLInferencer))
+
+gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
+
+gpqa_datasets = []
+gpqa_subsets = {
+    # 'extended': 'gpqa_extended.csv',
+    # 'main': 'gpqa_main.csv',
+    'diamond': 'gpqa_diamond.csv'
+}
+
+for split in list(gpqa_subsets.keys()):
+    gpqa_datasets.append(
+        dict(
+            abbr='GPQA_' + split,
+            type=GPQADataset,
+            path='./data/gpqa/',
+            name=gpqa_subsets[split],
+            reader_cfg=gpqa_reader_cfg,
+            infer_cfg=gpqa_infer_cfg,
+            eval_cfg=gpqa_eval_cfg)
+    )
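How the few-shot prompt above gets assembled: `FixKRetriever` renders dev examples 0-4 with the `ice_template`, and the rendered block is spliced in at the `</E>` ice token of the `prompt_template`. A toy rendering of that mechanism, using plain `str` templates instead of OpenCompass `PromptTemplate` objects and made-up questions:

```python
ice_template = 'Question: {question}\nAnswer: {answer}\n\n'
prompt_template = 'HINT\n</E>Question: {question}\nAnswer: '

dev_examples = [
    {'question': 'What is 2 + 2?', 'answer': '4'},
    {'question': 'What is 3 * 3?', 'answer': '9'},
]

# Render the in-context examples, then substitute them for the ice token.
ices = ''.join(ice_template.format(**ex) for ex in dev_examples)
final_prompt = prompt_template.replace('</E>', ices).format(question='What is 5 - 1?')
print(final_prompt)
```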
diff --git a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_few_shot_gen_bfaf90.py b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_few_shot_gen_bfaf90.py
new file mode 100644
index 00000000..dc12fd1d
--- /dev/null
+++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_few_shot_gen_bfaf90.py
@@ -0,0 +1,47 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MMLUProDataset, MMLUProBaseEvaluator
+
+with read_base():
+    from .mmlu_pro_categories import categories
+
+mmlu_pro_datasets = []
+
+for category in categories:
+    hint = f'Answer the following multiple choice question about {category}, and give your answer option directly.'
+    question_and_options = 'Question:\n{question}\nOptions:\n{options_str}'
+    mmlu_pro_reader_cfg = dict(
+        input_columns=['question', 'cot_content', 'options_str'],
+        output_column='answer_string',
+        train_split='validation',
+        test_split='test',
+    )
+    mmlu_pro_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=f'{question_and_options}\nAnswer: {{answer}}'),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f'{hint}\n</E>{question_and_options}\nAnswer: ',
+            ice_token='</E>'
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=GenInferencer, max_out_len=100)
+    )
+
+    mmlu_pro_eval_cfg = dict(
+        evaluator=dict(type=MMLUProBaseEvaluator)
+    )
+
+    mmlu_pro_datasets.append(
+        dict(
+            abbr=f'mmlu_pro_{category.replace(" ", "_")}',
+            type=MMLUProDataset,
+            path='opencompass/mmlu_pro',
+            category=category,
+            reader_cfg=mmlu_pro_reader_cfg,
+            infer_cfg=mmlu_pro_infer_cfg,
+            eval_cfg=mmlu_pro_eval_cfg,
+        ))
diff --git a/opencompass/configs/datasets/wikibench/wikibench_gen_0978ad.py b/opencompass/configs/datasets/wikibench/wikibench_gen_0978ad.py
new file mode 100644
index 00000000..871133f9
--- /dev/null
+++ b/opencompass/configs/datasets/wikibench/wikibench_gen_0978ad.py
@@ -0,0 +1,56 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+from opencompass.datasets import WikiBenchDataset
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+
+# Prompt (zh): a single-choice question; reason step by step and finish with
+# '所以答案为选项X' ('so the answer is option X'), X being one of A/B/C/D.
+single_choice_prompts = {
+    'single_choice_cn': '以下是一道单项选择题,请你根据你了解的知识一步步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。\n下面是你要回答的题目:\n{question}\n让我们一步步推理:',
+}
+
+wikibench_sets = {
+    'wiki': ['single_choice_cn'],
+}
+
+do_circular = True
+
+wikibench_datasets = []
+
+for _split in list(wikibench_sets.keys()):
+    for _name in wikibench_sets[_split]:
+        wikibench_infer_cfg = dict(
+            ice_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin='</E>',
+                    round=[
+                        dict(role='HUMAN', prompt=single_choice_prompts[_name]),
+                        dict(role='BOT', prompt='{answer}'),
+                    ],
+                ),
+                ice_token='</E>',
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer),
+        )
+        wikibench_eval_cfg = dict(
+            evaluator=dict(type=CircularEvaluator if do_circular else AccEvaluator),
+            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
+        )
+
+        wikibench_datasets.append(
+            dict(
+                type=WikiBenchDataset,
+                path=f'./data/WikiBench/{_name}.jsonl',
+                name=('circular_' + _name) if do_circular else _name,
+                abbr='wikibench-' + _split + '-' + _name + ('circular' if do_circular else ''),
+                reader_cfg=dict(
+                    input_columns=['question'],
+                    output_column='answer',
+                ),
+                infer_cfg=wikibench_infer_cfg,
+                eval_cfg=wikibench_eval_cfg,
+            )
+        )
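Both wikibench configs post-process the model's free-form reasoning with `first_option_postprocess` to recover a single option letter. The real postprocessor matches many answer phrasings; the sketch below only conveys the idea and is not the library's implementation:

```python
import re

def toy_first_option(text: str, options: str = 'ABCD') -> str:
    # Return the first option letter found in the model output,
    # e.g. '...所以答案为选项B' -> 'B'; empty string if none is found.
    match = re.search(rf'[{options}]', text)
    return match.group(0) if match else ''

assert toy_first_option('让我们一步步推理:……所以答案为选项B') == 'B'
assert toy_first_option('没有给出答案') == ''
```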
diff --git a/opencompass/datasets/mmlu_pro.py b/opencompass/datasets/mmlu_pro.py
index d5e2a4e4..0f2957b8 100644
--- a/opencompass/datasets/mmlu_pro.py
+++ b/opencompass/datasets/mmlu_pro.py
@@ -3,19 +3,26 @@
 
 from datasets import load_dataset
 
+from opencompass.openicl import BaseEvaluator
 from opencompass.registry import LOAD_DATASET
 from opencompass.utils import get_data_path
 
 from .base import BaseDataset
 
+CHOICES = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']
+
 
 def _parse(item):
-    choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']
     s = ''
+    item['answer_string'] = ''
     for i, opt in enumerate(item['options']):
         if opt == 'N/A':
             continue
-        s += '{}. {}\n'.format(choices[i], opt)
+        option = '{}. {}\n'.format(CHOICES[i], opt)
+        s += option
+        # keep the full 'X. text' string of the gold option for later matching
+        if item['answer'] == CHOICES[i]:
+            item['answer_string'] = option
+
     item['options_str'] = s.strip()
     item['cot_content'] = item['cot_content'].removeprefix("A: Let's think step by step.").strip()
     return item
@@ -31,3 +38,38 @@ class MMLUProDataset(BaseDataset):
         mmlu_pro = mmlu_pro.filter(lambda x: x['category'] == category)
         mmlu_pro = mmlu_pro.map(_parse)
         return mmlu_pro
+
+
+class MMLUProBaseEvaluator(BaseEvaluator):
+
+    def is_equal(self, pred, refer):
+        try:
+            # `refer` looks like 'B. some option text\n'; accept either the
+            # bare option letter or the full option text as correct.
+            refer_option, refer_string = refer.split('. ', 1)
+            if pred in CHOICES and refer_option == pred:
+                return True
+            elif refer_string.strip() == pred:
+                return True
+        except Exception:
+            pass
+        return False
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different length'
+            }
+        correct = 0
+        count = 0
+        details = []
+        for i, j in zip(predictions, references):
+            i = i.split('\n')[0].strip()  # judge only the first line of the prediction
+            detail = {'pred': i, 'answer': j, 'correct': False}
+            count += 1
+            if self.is_equal(i, j):
+                correct += 1
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
+        return result
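A quick sanity check for the new evaluator: `is_equal` accepts either the bare option letter or the full option text, and `score` judges only the first line of each prediction. Assuming the PR is merged so the class is importable:

```python
from opencompass.datasets.mmlu_pro import MMLUProBaseEvaluator

evaluator = MMLUProBaseEvaluator()
predictions = ['B', 'the correct option text', 'C\nbecause ...']
references = ['B. 42\n', 'A. the correct option text\n', 'D. something else\n']

result = evaluator.score(predictions, references)
print(result['accuracy'])  # 66.66...: a letter match, a text match, then a miss
```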