Mirror of https://github.com/open-compass/opencompass.git
[Feature] Update MathBench & WikiBench for FullBench (#1521)
* Update MathBench & WikiBench for FullBench
* Update GPQA & MMLU_Pro

Co-authored-by: liushz <liuhongwei@pjlab.rog.cn>
parent cfbd308edf
commit c9a7026f59
configs/datasets/MathBench/mathbench_2024_gen_50a320.py (new file, 81 lines)
@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, math_postprocess_v2
from opencompass.utils.text_postprocessors import first_option_postprocess

with read_base():
    from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets

# Max for this dataset is 4
num_shot = 0
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False

assert 0 <= num_shot <= 4
if num_shot == 0:
    prompts = zero_shot_prompts
else:
    prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}

mathbench_datasets = []
for _split in mathbench_sets:
    for _name in mathbench_sets[_split]:
        if 'single_choice' in _name:
            if with_reasoning:
                template_round = prompts[_name + '_with_reasoning']
            else:
                template_round = prompts[_name]
        else:
            template_round = prompts[_name]

        if 'single_choice' in _name:
            pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
        else:
            pred_postprocessor = dict(type=math_postprocess_v2)

        if 'single_choice' in _name and with_circular_eval:
            evaluator = dict(type=CircularEvaluator)
        else:
            evaluator = dict(type=AccEvaluator)

        # assemble the final config
        mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
        if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
            template = {}
            for answer in ['A', 'B', 'C', 'D']:
                one_template_round = deepcopy(template_round)
                one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
                template[answer] = dict(round=one_template_round)
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=template),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=PPLInferencer),
            )
        else:
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=GenInferencer, max_out_len=2048),
            )
        mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)

        mathbench_datasets.append(
            dict(
                abbr='mathbench-' + _split + '-' + _name,
                type=MathBenchDataset,
                path=f'data/mathbench_v1/{_split}',
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=mathbench_reader_cfg,
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            )
        )
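Note on the slice p[- 2 * num_shot - 2:] above: it assumes each few_shot_prompts entry is a flat list of alternating HUMAN/BOT turns, four worked examples followed by one final question/answer pair, so the slice keeps the last num_shot examples plus the query pair. A minimal sketch under that assumption (the prompt texts here are placeholders, not taken from mathbench_prompt.py):

# toy few-shot list: 2 worked examples plus the final query pair
few_shot = [
    dict(role='HUMAN', prompt='Q1 ...'), dict(role='BOT', prompt='A1 ...'),      # shot 1
    dict(role='HUMAN', prompt='Q2 ...'), dict(role='BOT', prompt='A2 ...'),      # shot 2
    dict(role='HUMAN', prompt='{question}'), dict(role='BOT', prompt='{answer}'),  # query pair
]
num_shot = 1
kept = few_shot[-2 * num_shot - 2:]  # keeps the last shot plus the query pair
assert len(kept) == 2 * num_shot + 2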
configs/datasets/MathBench/mathbench_prompt.py
@@ -11,6 +11,12 @@ zero_shot_prompts = {
     'single_choice_en': [
         dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
     ],
+    'cloze_en': [
+        dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
+    ],
+    'cloze_cn': [
+        dict(role='HUMAN', prompt='{question}\n请一步一步推理,并在最后用\\boxed{}给出你的答案。'),
+    ]
 }

 few_shot_prompts = {
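Both this config (with_circular_eval) and the WikiBench config below (do_circular) grade single-choice items with CircularEvaluator. The idea, as a rough sketch rather than the actual implementation: each question is expanded into copies with the option order rotated, and the strictest view counts a question as solved only when every rotation is answered correctly.

# toy illustration of circular expansion (not OpenCompass internals)
question = dict(options=['3', '5', '7', '9'], answer=0)  # gold option at index 0
rotations = []
for k in range(4):
    opts = question['options'][k:] + question['options'][:k]
    rotations.append(dict(options=opts, answer=(question['answer'] - k) % 4))
# a model gets full circular credit only if it picks the gold option
# in all four rotated presentations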
configs/datasets/wikibench/wikibench_gen_0978ad.py (new file, 56 lines)
@@ -0,0 +1,56 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import WikiBenchDataset
from opencompass.utils.text_postprocessors import first_option_postprocess


single_choice_prompts = {
    'single_choice_cn': '以下是一道单项选择题,请你根据你了解的知识一步步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。。\n下面是你要回答的题目:\n{question}\n让我们一步步推理:',
}

wikibench_sets = {
    'wiki': ['single_choice_cn'],
}

do_circular = True

wikibench_datasets = []

for _split in list(wikibench_sets.keys()):
    for _name in wikibench_sets[_split]:
        wikibench_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin='</E>',
                    round=[
                        dict(role='HUMAN', prompt=single_choice_prompts[_name]),
                        dict(role='BOT', prompt='{answer}'),
                    ],
                ),
                ice_token='</E>',
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer),
        )
        wikibench_eval_cfg = dict(
            evaluator=dict(type=CircularEvaluator if do_circular else AccEvaluator),
            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
        )

        wikibench_datasets.append(
            dict(
                type=WikiBenchDataset,
                path=f'./data/WikiBench/{_name}.jsonl',
                name='circular_' + _name if do_circular else _name,
                abbr='wikibench-' + _split + '-' + _name + 'circular' if do_circular else '',
                reader_cfg=dict(
                    input_columns=['question'],
                    output_column='answer',
                ),
                infer_cfg=wikibench_infer_cfg,
                eval_cfg=wikibench_eval_cfg,
            )
        )
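To run the two dataset lists above, a top-level eval config composes them through read_base, the usual OpenCompass pattern. A minimal sketch (the file name eval_fullbench.py and the model import are illustrative, not part of this commit):

# hypothetical configs/eval_fullbench.py
from mmengine.config import read_base

with read_base():
    from .datasets.MathBench.mathbench_2024_gen_50a320 import mathbench_datasets
    from .datasets.wikibench.wikibench_gen_0978ad import wikibench_datasets
    # any model config shipped with OpenCompass works here; this import is illustrative
    from .models.hf_internlm.hf_internlm2_7b import models

datasets = [*mathbench_datasets, *wikibench_datasets]
# launch with: python run.py configs/eval_fullbench.py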
New file, 49 lines (GPQA few-shot PPL config; the file path is not shown in this view):
@@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import GPQADataset, GPQAEvaluator
from opencompass.utils import first_option_postprocess


gpqa_reader_cfg = dict(
    input_columns=['question', 'A', 'B', 'C', 'D'],
    output_column='answer')

hint = f'对下面的单项选择题,请直接给出正确答案的选项。'
question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'
gpqa_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template={
            opt: f'{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
    ),
    prompt_template=dict(
        type=PromptTemplate,
        template={
            opt: f'{hint}\n</E>{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']
        },
        ice_token='</E>'
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
    inferencer=dict(type=PPLInferencer))

gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))

gpqa_datasets = []
gpqa_subsets = {
    # 'extended': 'gpqa_extended.csv',
    # 'main': 'gpqa_main.csv',
    'diamond': 'gpqa_diamond.csv'
}

for split in list(gpqa_subsets.keys()):
    gpqa_datasets.append(
        dict(
            abbr='GPQA_' + split,
            type=GPQADataset,
            path='./data/gpqa/',
            name=gpqa_subsets[split],
            reader_cfg=gpqa_reader_cfg,
            infer_cfg=gpqa_infer_cfg,
            eval_cfg=gpqa_eval_cfg)
    )
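Since the GPQA templates above map each option letter to a fully rendered prompt, PPLInferencer scores one candidate prompt per option and picks by likelihood. A sketch of that selection rule (score_fn is an assumed interface standing in for the model's average per-token loss, not an OpenCompass API):

def pick_by_ppl(score_fn, prompts_by_option):
    # score_fn(text) -> average negative log-likelihood; lower means more likely
    return min(prompts_by_option, key=lambda opt: score_fn(prompts_by_option[opt]))

# toy usage with a scorer that simply prefers shorter prompts
print(pick_by_ppl(len, {'A': 'xx', 'B': 'x', 'C': 'xxx', 'D': 'xxxx'}))  # -> 'B'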
New file, 47 lines (MMLU-Pro few-shot gen config; the file path is not shown in this view):
@@ -0,0 +1,47 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MMLUProDataset, MMLUProBaseEvaluator

with read_base():
    from .mmlu_pro_categories import categories

mmlu_pro_datasets = []

for category in categories:
    hint = f'Answer the following multiple choice question about {category}, and give your answer option directly.'
    question_and_options = 'Question:\n{question}\nOptions:\n{options_str}'
    mmlu_pro_reader_cfg = dict(
        input_columns=['question', 'cot_content', 'options_str'],
        output_column='answer_string',
        train_split='validation',
        test_split='test',
    )
    mmlu_pro_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=f'{question_and_options}\nAnswer: {{answer}}'),
        prompt_template=dict(
            type=PromptTemplate,
            template=f'{hint}\n</E>{question_and_options}\nAnswer: ',
            ice_token='</E>'
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer, max_out_len=100)
    )

    mmlu_pro_eval_cfg = dict(
        evaluator=dict(type=MMLUProBaseEvaluator)
    )

    mmlu_pro_datasets.append(
        dict(
            abbr=f'mmlu_pro_{category.replace(" ", "_")}',
            type=MMLUProDataset,
            path='opencompass/mmlu_pro',
            category=category,
            reader_cfg=mmlu_pro_reader_cfg,
            infer_cfg=mmlu_pro_infer_cfg,
            eval_cfg=mmlu_pro_eval_cfg,
        ))
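FixKRetriever with fix_id_list=[0, 1, 2, 3, 4] prepends the same five rows of the train split ('validation') as in-context examples, spliced in where the </E> ice_token appears in the prompt template. A simplified, self-contained illustration of that splice (plain str.format stands in for PromptTemplate; the rows are made up):

ice_template = 'Question:\n{question}\nOptions:\n{options_str}\nAnswer: {answer}'
prompt_template = 'HINT\n</E>Question:\n{question}\nOptions:\n{options_str}\nAnswer: '

fixed_rows = [  # stand-ins for validation rows 0..4 (two shown for brevity)
    dict(question='q0', options_str='A. x\nB. y', answer='A. x'),
    dict(question='q1', options_str='A. u\nB. v', answer='B. v'),
]
ice = '\n'.join(ice_template.format(**row) for row in fixed_rows)
test_row = dict(question='q_test', options_str='A. p\nB. q')
print(prompt_template.replace('</E>', ice + '\n').format(**test_row))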
opencompass/datasets/mmlu_pro.py (modified)
@@ -3,19 +3,26 @@
 
 from datasets import load_dataset
 
+from opencompass.openicl import BaseEvaluator
 from opencompass.registry import LOAD_DATASET
 from opencompass.utils import get_data_path
 
 from .base import BaseDataset
 
+CHOICES=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']
+
 def _parse(item):
-    choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']
     s = ''
+    item['answer_string'] = ''
     for i, opt in enumerate(item['options']):
         if opt == 'N/A':
             continue
-        s += '{}. {}\n'.format(choices[i], opt)
+        option = '{}. {}\n'.format(CHOICES[i], opt)
+        s += option
+        if item['answer'] == CHOICES[i]:
+            item['answer_string'] = option
+
     item['options_str'] = s.strip()
     item['cot_content'] = item['cot_content'].removeprefix("A: Let's think step by step.").strip()
     return item
@@ -31,3 +38,38 @@ class MMLUProDataset(BaseDataset):
         mmlu_pro = mmlu_pro.filter(lambda x: x['category'] == category)
         mmlu_pro = mmlu_pro.map(_parse)
         return mmlu_pro
+
+
+class MMLUProBaseEvaluator(BaseEvaluator):
+
+    def is_equal(self, pred, refer):
+        try:
+            refer_option, refer_string = refer.split('. ')
+            if pred in CHOICES and refer_option == pred:
+                return True
+            elif refer_string.strip() == pred:
+                return True
+            else :
+                return False
+        except Exception:
+            pass
+        return False
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        correct = 0
+        count = 0
+        details = []
+        for i, j in zip(predictions, references):
+            i = i.split('\n')[0].strip()
+            detail = {'pred': i, 'answer': j, 'correct': False}
+            count += 1
+            if self.is_equal(i, j):
+                correct += 1
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
+        return result
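The matching rule in MMLUProBaseEvaluator is worth spelling out: references are the answer_string values built in _parse ('C. option text\n'), and a prediction passes as either the bare option letter or the option text itself. A quick sanity check with made-up values (assuming the import path used by the config above and that BaseEvaluator needs no constructor arguments):

from opencompass.datasets import MMLUProBaseEvaluator

ev = MMLUProBaseEvaluator()
assert ev.is_equal('C', 'C. 4.5 m/s\n')        # bare option letter matches
assert ev.is_equal('4.5 m/s', 'C. 4.5 m/s\n')  # option text matches
assert not ev.is_equal('B', 'C. 4.5 m/s\n')
# only the first line of a prediction is compared
print(ev.score(['C\nbecause ...'], ['C. 4.5 m/s\n']))  # accuracy: 100.0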