[Feature] Update MathBench & WikiBench for FullBench (#1521)

* Update MathBench & WikiBench for FullBench

* Update MathBench & WikiBench for FullBench

* Update GPQA & MMLU_Pro

* Update MathBench & WikiBench for FullBench

* Update MathBench & WikiBench for FullBench

* Update MathBench & WikiBench for FullBench

---------

Co-authored-by: liushz <liuhongwei@pjlab.rog.cn>
Author: liushz, 2024-09-18 14:35:30 +08:00 (committed by GitHub)
parent cfbd308edf
commit c9a7026f59
9 changed files with 426 additions and 2 deletions

View File

@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, math_postprocess_v2
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
# Max for this dataset is 4
num_shot = 0
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False
assert 0 <= num_shot <= 4
if num_shot == 0:
prompts = zero_shot_prompts
else:
prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
mathbench_datasets = []
for _split in mathbench_sets:
for _name in mathbench_sets[_split]:
if 'single_choice' in _name:
if with_reasoning:
template_round = prompts[_name + '_with_reasoning']
else:
template_round = prompts[_name]
else:
template_round = prompts[_name]
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=math_postprocess_v2)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
# assemble the final config
mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = deepcopy(template_round)
one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)
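For context, a dataset config like the one above is not run on its own; it gets pulled into a top-level evaluation config together with one or more model configs. The sketch below is illustrative only: the import paths and the model config name are placeholders, not files touched by this commit.

from mmengine.config import read_base

with read_base():
    # hypothetical module paths; substitute the real config modules in your checkout
    from .datasets.mathbench.mathbench_gen import mathbench_datasets
    from .models.hf_internlm.hf_internlm2_chat_7b import models

# OpenCompass picks up whatever the top-level config exposes as datasets / models
datasets = [*mathbench_datasets]

Such a config is then typically launched with something like python run.py configs/eval_mathbench.py (the file name here is again a placeholder).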

View File

@@ -11,6 +11,12 @@ zero_shot_prompts = {
    'single_choice_en': [
        dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
    ],
    'cloze_en': [
        dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
    ],
    'cloze_cn': [
        dict(role='HUMAN', prompt='{question}\n请一步一步推理,并在最后用\\boxed{}给出你的答案。'),
    ]
}

few_shot_prompts = {
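The new cloze prompts ask the model to wrap its final answer in \boxed{}, which the cloze branch's math_postprocess_v2 is then expected to pull back out when grading. A toy regex sketch of that extraction step (illustrative only, not the actual math_postprocess_v2 implementation):

import re

def extract_boxed_answer(text: str) -> str:
    # take the contents of the last \boxed{...} span in the response
    matches = re.findall(r'\\boxed\{([^{}]*)\}', text)
    return matches[-1].strip() if matches else ''

extract_boxed_answer('The sum is 40 + 2, so the answer is \\boxed{42}.')  # -> '42'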

View File

@@ -0,0 +1,56 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import WikiBenchDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
'single_choice_cn': '以下是一道单项选择题请你根据你了解的知识一步步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。。\n下面是你要回答的题目:\n{question}\n让我们一步步推理:',
}
wikibench_sets = {
'wiki': ['single_choice_cn'],
}
do_circular = True
wikibench_datasets = []
for _split in list(wikibench_sets.keys()):
for _name in wikibench_sets[_split]:
wikibench_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt=single_choice_prompts[_name]),
dict(role='BOT', prompt='{answer}'),
],
),
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
wikibench_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if do_circular else AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)
wikibench_datasets.append(
dict(
type=WikiBenchDataset,
path=f'./data/WikiBench/{_name}.jsonl',
name='circular_' + _name if do_circular else _name,
abbr='wikibench-' + _split + '-' + _name + 'circular' if do_circular else '',
reader_cfg=dict(
input_columns=['question'],
output_column='answer',
),
infer_cfg=wikibench_infer_cfg,
eval_cfg=wikibench_eval_cfg,
)
)
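Because do_circular is True, the dataset is loaded under the name circular_single_choice_cn and scored with CircularEvaluator. The idea, sketched below with toy code (this is not the actual evaluator), is that each question appears once per rotation of its ABCD options, and a stricter score only credits questions answered correctly under every rotation:

from collections import defaultdict

def circular_scores(records):
    # records: list of (question_id, is_correct) pairs over all rotated copies
    per_question = defaultdict(list)
    for qid, ok in records:
        per_question[qid].append(ok)
    acc = sum(ok for _, ok in records) / len(records)  # plain accuracy over all copies
    strict = sum(all(v) for v in per_question.values()) / len(per_question)  # every rotation must be right
    return {'accuracy': 100 * acc, 'strict_accuracy': 100 * strict}

circular_scores([('q1', True), ('q1', True), ('q1', True), ('q1', True),
                 ('q2', True), ('q2', False), ('q2', True), ('q2', True)])
# -> {'accuracy': 87.5, 'strict_accuracy': 50.0}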

View File

@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, math_postprocess_v2
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
# Max for this dataset is 4
num_shot = 0
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False
assert 0 <= num_shot <= 4
if num_shot == 0:
prompts = zero_shot_prompts
else:
prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
mathbench_datasets = []
for _split in mathbench_sets:
for _name in mathbench_sets[_split]:
if 'single_choice' in _name:
if with_reasoning:
template_round = prompts[_name + '_with_reasoning']
else:
template_round = prompts[_name]
else:
template_round = prompts[_name]
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=math_postprocess_v2)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
# assemble the final config
mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = deepcopy(template_round)
one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)

View File

@@ -11,6 +11,12 @@ zero_shot_prompts = {
    'single_choice_en': [
        dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
    ],
    'cloze_en': [
        dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
    ],
    'cloze_cn': [
        dict(role='HUMAN', prompt='{question}\n请一步一步推理,并在最后用\\boxed{}给出你的答案。'),
    ]
}

few_shot_prompts = {

View File

@@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import GPQADataset, GPQAEvaluator
from opencompass.utils import first_option_postprocess
gpqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer')
hint = f'对下面的单项选择题,请直接给出正确答案的选项。'
question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'
gpqa_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={
opt: f'{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
),
prompt_template=dict(
type=PromptTemplate,
template={
opt: f'{hint}\n</E>{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']
},
ice_token='</E>'
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer))
gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
gpqa_datasets = []
gpqa_subsets = {
# 'extended': 'gpqa_extended.csv',
# 'main': 'gpqa_main.csv',
'diamond': 'gpqa_diamond.csv'
}
for split in list(gpqa_subsets.keys()):
gpqa_datasets.append(
dict(
abbr='GPQA_' + split,
type=GPQADataset,
path='./data/gpqa/',
name=gpqa_subsets[split],
reader_cfg=gpqa_reader_cfg,
infer_cfg=gpqa_infer_cfg,
eval_cfg=gpqa_eval_cfg)
)
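This GPQA config is the PPL-style variant typically used for base models: FixKRetriever supplies five fixed in-context examples, and instead of generating an answer, the rendered question is completed with each of 'Answer: A' through 'Answer: D' and the option whose completion the model scores as most likely wins. A minimal sketch of that selection rule, assuming a hypothetical loss_fn helper (the real work is done by PPLInferencer):

def ppl_choice(rendered_question, options, loss_fn):
    # loss_fn(text) -> average per-token loss under the model (assumed helper)
    scored = {opt: loss_fn(f'{rendered_question}\nAnswer: {opt}') for opt in options}
    return min(scored, key=scored.get)  # lowest loss == highest likelihood

# ppl_choice(rendered_question, ['A', 'B', 'C', 'D'], loss_fn=my_model_loss)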

View File

@@ -0,0 +1,47 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MMLUProDataset, MMLUProBaseEvaluator
with read_base():
from .mmlu_pro_categories import categories
mmlu_pro_datasets = []
for category in categories:
hint = f'Answer the following multiple choice question about {category}, and give your answer option directly.'
question_and_options = 'Question:\n{question}\nOptions:\n{options_str}'
mmlu_pro_reader_cfg = dict(
input_columns=['question', 'cot_content', 'options_str'],
output_column='answer_string',
train_split='validation',
test_split='test',
)
mmlu_pro_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=f'{question_and_options}\nAnswer: {{answer}}'),
prompt_template=dict(
type=PromptTemplate,
template=f'{hint}\n</E>{question_and_options}\nAnswer: ',
ice_token='</E>'
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=GenInferencer, max_out_len=100)
)
mmlu_pro_eval_cfg = dict(
evaluator=dict(type=MMLUProBaseEvaluator)
)
mmlu_pro_datasets.append(
dict(
abbr=f'mmlu_pro_{category.replace(" ", "_")}',
type=MMLUProDataset,
path='opencompass/mmlu_pro',
category=category,
reader_cfg=mmlu_pro_reader_cfg,
infer_cfg=mmlu_pro_infer_cfg,
eval_cfg=mmlu_pro_eval_cfg,
))
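For reference, this is roughly how the pieces compose at inference time: FixKRetriever always selects validation items 0-4, ice_template renders each of them with its gold answer_string, and prompt_template splices them in at </E> ahead of the test question. The skeleton below is an assumed illustration of the final prompt, not output captured from a run:

rendered_prompt_skeleton = (
    'Answer the following multiple choice question about math, and give your answer option directly.\n'
    'Question:\n<validation question 0>\nOptions:\n<options 0>\nAnswer: <gold option and text 0>\n'
    # ... four more in-context examples ...
    'Question:\n<test question>\nOptions:\n<test options>\nAnswer: '
)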

View File

@@ -0,0 +1,56 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import WikiBenchDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
'single_choice_cn': '以下是一道单项选择题请你根据你了解的知识一步步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。。\n下面是你要回答的题目:\n{question}\n让我们一步步推理:',
}
wikibench_sets = {
'wiki': ['single_choice_cn'],
}
do_circular = True
wikibench_datasets = []
for _split in list(wikibench_sets.keys()):
for _name in wikibench_sets[_split]:
wikibench_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt=single_choice_prompts[_name]),
dict(role='BOT', prompt='{answer}'),
],
),
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
wikibench_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if do_circular else AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)
wikibench_datasets.append(
dict(
type=WikiBenchDataset,
path=f'./data/WikiBench/{_name}.jsonl',
name='circular_' + _name if do_circular else _name,
abbr='wikibench-' + _split + '-' + _name + 'circular' if do_circular else '',
reader_cfg=dict(
input_columns=['question'],
output_column='answer',
),
infer_cfg=wikibench_infer_cfg,
eval_cfg=wikibench_eval_cfg,
)
)

View File

@@ -3,19 +3,26 @@
from datasets import load_dataset

from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from .base import BaseDataset

CHOICES=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']


def _parse(item):
    s = ''
    item['answer_string'] = ''
    for i, opt in enumerate(item['options']):
        if opt == 'N/A':
            continue
        option = '{}. {}\n'.format(CHOICES[i], opt)
        s += option
        if item['answer'] == CHOICES[i]:
            item['answer_string'] = option
    item['options_str'] = s.strip()
    item['cot_content'] = item['cot_content'].removeprefix("A: Let's think step by step.").strip()
    return item
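A quick worked example of what the updated _parse produces for a toy item: 'N/A' options are dropped but the remaining options keep their original letters, and the new answer_string field carries the gold option together with its text.

item = {
    'question': '2 + 2 = ?',
    'options': ['3', '4', 'N/A', '5'],
    'answer': 'B',
    'cot_content': "A: Let's think step by step. 2 + 2 is 4.",
}
item = _parse(item)
# item['options_str']   == 'A. 3\nB. 4\nD. 5'
# item['answer_string'] == 'B. 4\n'
# item['cot_content']   == '2 + 2 is 4.'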
@@ -31,3 +38,38 @@ class MMLUProDataset(BaseDataset):
        mmlu_pro = mmlu_pro.filter(lambda x: x['category'] == category)
        mmlu_pro = mmlu_pro.map(_parse)
        return mmlu_pro
class MMLUProBaseEvaluator(BaseEvaluator):
def is_equal(self, pred, refer):
try:
refer_option, refer_string = refer.split('. ')
if pred in CHOICES and refer_option == pred:
return True
elif refer_string.strip() == pred:
return True
else :
return False
except Exception:
pass
return False
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
correct = 0
count = 0
details = []
for i, j in zip(predictions, references):
i = i.split('\n')[0].strip()
detail = {'pred': i, 'answer': j, 'correct': False}
count += 1
if self.is_equal(i, j):
correct += 1
detail['correct'] = True
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result
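And a toy run of the new evaluator, showing that is_equal accepts either the bare option letter or the answer text, and that only the first line of each prediction is considered:

evaluator = MMLUProBaseEvaluator()
result = evaluator.score(
    predictions=['B', 'Paris\nBecause the capital of France is Paris.'],
    references=['B. 4', 'C. Paris'],
)
# result['accuracy'] == 100.0; per-item matches are listed under result['details']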